In [ ]:
# Tools: pandas for data handling, matplotlib for plotting
In [ ]:
## student_depression_dataset
In [2]:
import pandas as pd
from matplotlib import pyplot as plt
In [12]:
# Read the student depression CSV into the `data` DataFrame and display it.
# NOTE(review): the path starts at the current user's home directory ('~');
# adjust it if the dataset is stored elsewhere.
dataset_path = '~/work/rinchen-khandu/datasets/student_depression_dataset.csv'
data = pd.read_csv(dataset_path)
data
Out[12]:
| id | Gender | Age | City | Profession | Academic Pressure | Work Pressure | CGPA | Study Satisfaction | Job Satisfaction | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | Depression | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | Male | 33.0 | Visakhapatnam | Student | 5.0 | 0.0 | 8.97 | 2.0 | 0.0 | '5-6 hours' | Healthy | B.Pharm | Yes | 3.0 | 1.0 | No | 1 |
| 1 | 8 | Female | 24.0 | Bangalore | Student | 2.0 | 0.0 | 5.90 | 5.0 | 0.0 | '5-6 hours' | Moderate | BSc | No | 3.0 | 2.0 | Yes | 0 |
| 2 | 26 | Male | 31.0 | Srinagar | Student | 3.0 | 0.0 | 7.03 | 5.0 | 0.0 | 'Less than 5 hours' | Healthy | BA | No | 9.0 | 1.0 | Yes | 0 |
| 3 | 30 | Female | 28.0 | Varanasi | Student | 3.0 | 0.0 | 5.59 | 2.0 | 0.0 | '7-8 hours' | Moderate | BCA | Yes | 4.0 | 5.0 | Yes | 1 |
| 4 | 32 | Female | 25.0 | Jaipur | Student | 4.0 | 0.0 | 8.13 | 3.0 | 0.0 | '5-6 hours' | Moderate | M.Tech | Yes | 1.0 | 1.0 | No | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 27896 | 140685 | Female | 27.0 | Surat | Student | 5.0 | 0.0 | 5.75 | 5.0 | 0.0 | '5-6 hours' | Unhealthy | 'Class 12' | Yes | 7.0 | 1.0 | Yes | 0 |
| 27897 | 140686 | Male | 27.0 | Ludhiana | Student | 2.0 | 0.0 | 9.40 | 3.0 | 0.0 | 'Less than 5 hours' | Healthy | MSc | No | 0.0 | 3.0 | Yes | 0 |
| 27898 | 140689 | Male | 31.0 | Faridabad | Student | 3.0 | 0.0 | 6.61 | 4.0 | 0.0 | '5-6 hours' | Unhealthy | MD | No | 12.0 | 2.0 | No | 0 |
| 27899 | 140690 | Female | 18.0 | Ludhiana | Student | 5.0 | 0.0 | 6.88 | 2.0 | 0.0 | 'Less than 5 hours' | Healthy | 'Class 12' | Yes | 10.0 | 5.0 | No | 1 |
| 27900 | 140699 | Male | 27.0 | Patna | Student | 4.0 | 0.0 | 9.24 | 1.0 | 0.0 | 'Less than 5 hours' | Healthy | BCA | Yes | 2.0 | 3.0 | Yes | 1 |
27901 rows × 18 columns
In [13]:
# Data cleaning: inspect the frame, check for missing values, and remove Age outliers
In [14]:
# Preview the first five rows of the loaded frame.
data.head(n=5)
Out[14]:
| id | Gender | Age | City | Profession | Academic Pressure | Work Pressure | CGPA | Study Satisfaction | Job Satisfaction | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | Depression | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | Male | 33.0 | Visakhapatnam | Student | 5.0 | 0.0 | 8.97 | 2.0 | 0.0 | '5-6 hours' | Healthy | B.Pharm | Yes | 3.0 | 1.0 | No | 1 |
| 1 | 8 | Female | 24.0 | Bangalore | Student | 2.0 | 0.0 | 5.90 | 5.0 | 0.0 | '5-6 hours' | Moderate | BSc | No | 3.0 | 2.0 | Yes | 0 |
| 2 | 26 | Male | 31.0 | Srinagar | Student | 3.0 | 0.0 | 7.03 | 5.0 | 0.0 | 'Less than 5 hours' | Healthy | BA | No | 9.0 | 1.0 | Yes | 0 |
| 3 | 30 | Female | 28.0 | Varanasi | Student | 3.0 | 0.0 | 5.59 | 2.0 | 0.0 | '7-8 hours' | Moderate | BCA | Yes | 4.0 | 5.0 | Yes | 1 |
| 4 | 32 | Female | 25.0 | Jaipur | Student | 4.0 | 0.0 | 8.13 | 3.0 | 0.0 | '5-6 hours' | Moderate | M.Tech | Yes | 1.0 | 1.0 | No | 0 |
In [15]:
# Show the column labels of the frame (useful for spotting names that contain
# spaces or punctuation, which must be quoted exactly when selecting columns).
data.columns
Out[15]:
Index(['id', 'Gender', 'Age', 'City', 'Profession', 'Academic Pressure',
'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction',
'Sleep Duration', 'Dietary Habits', 'Degree',
'Have you ever had suicidal thoughts ?', 'Work/Study Hours',
'Financial Stress', 'Family History of Mental Illness', 'Depression'],
dtype='object')
In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): in the saved output 'Work Pressure' and 'Job Satisfaction' are 0 up to
# the 75th percentile, so they are almost constant in this dataset.
data.describe(percentiles=[0.25, 0.5, 0.75])
Out[16]:
| id | Age | Academic Pressure | Work Pressure | CGPA | Study Satisfaction | Job Satisfaction | Work/Study Hours | Depression | |
|---|---|---|---|---|---|---|---|---|---|
| count | 27901.000000 | 27901.000000 | 27901.000000 | 27901.000000 | 27901.000000 | 27901.000000 | 27901.000000 | 27901.000000 | 27901.000000 |
| mean | 70442.149421 | 25.822300 | 3.141214 | 0.000430 | 7.656104 | 2.943837 | 0.000681 | 7.156984 | 0.585499 |
| std | 40641.175216 | 4.905687 | 1.381465 | 0.043992 | 1.470707 | 1.361148 | 0.044394 | 3.707642 | 0.492645 |
| min | 2.000000 | 18.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 35039.000000 | 21.000000 | 2.000000 | 0.000000 | 6.290000 | 2.000000 | 0.000000 | 4.000000 | 0.000000 |
| 50% | 70684.000000 | 25.000000 | 3.000000 | 0.000000 | 7.770000 | 3.000000 | 0.000000 | 8.000000 | 1.000000 |
| 75% | 105818.000000 | 30.000000 | 4.000000 | 0.000000 | 8.920000 | 4.000000 | 0.000000 | 10.000000 | 1.000000 |
| max | 140699.000000 | 59.000000 | 5.000000 | 5.000000 | 10.000000 | 5.000000 | 4.000000 | 12.000000 | 1.000000 |
In [17]:
# Count the missing values in each column.
data.isna().sum()
Out[17]:
id 0 Gender 0 Age 0 City 0 Profession 0 Academic Pressure 0 Work Pressure 0 CGPA 0 Study Satisfaction 0 Job Satisfaction 0 Sleep Duration 0 Dietary Habits 0 Degree 0 Have you ever had suicidal thoughts ? 0 Work/Study Hours 0 Financial Stress 0 Family History of Mental Illness 0 Depression 0 dtype: int64
In [18]:
# Drop every row that has a missing value in any column.
# `data` is rebound to the result instead of using inplace=True, so the frame is
# not mutated in place and the step stays chainable.
# NOTE(review): the saved output of the null check above reports 0 missing values
# in every column, so this is expected to leave the row count unchanged.
data = data.dropna()
In [19]:
# Show the dtype pandas inferred for each column.
# NOTE(review): in the saved output 'Financial Stress' is `object` although the
# displayed values look numeric (1.0, 2.0, ...) -- presumably the column contains
# at least one non-numeric entry; confirm before using it in numeric analysis.
data.dtypes
Out[19]:
id int64 Gender object Age float64 City object Profession object Academic Pressure float64 Work Pressure float64 CGPA float64 Study Satisfaction float64 Job Satisfaction float64 Sleep Duration object Dietary Habits object Degree object Have you ever had suicidal thoughts ? object Work/Study Hours float64 Financial Stress object Family History of Mental Illness object Depression int64 dtype: object
In [20]:
# Remove Age outliers: rows whose Age lies more than 1.5 * IQR below the first
# quartile or above the third quartile are dropped.
# NOTE: this cell rebinds `data` to the filtered frame, so running it again
# recomputes the quartiles on the already-filtered data.
age = data['Age']
Q1 = age.quantile(0.25)
Q3 = age.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_age_outlier = (age < lower_fence) | (age > upper_fence)
data = data[~is_age_outlier]
In [21]:
# Display the Gender column of the filtered frame.
data["Gender"]
Out[21]:
0 Male
1 Female
2 Male
3 Female
4 Female
...
27896 Female
27897 Male
27898 Male
27899 Female
27900 Male
Name: Gender, Length: 27889, dtype: object
In [22]:
# Number of rows per gender, most frequent first.
data["Gender"].value_counts(sort=True, ascending=False)
Out[22]:
Gender Male 15543 Female 12346 Name: count, dtype: int64
In [25]:
# Preview the last five rows of the filtered frame.
data.tail(n=5)
Out[25]:
| id | Gender | Age | City | Profession | Academic Pressure | Work Pressure | CGPA | Study Satisfaction | Job Satisfaction | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | Depression | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 27896 | 140685 | Female | 27.0 | Surat | Student | 5.0 | 0.0 | 5.75 | 5.0 | 0.0 | '5-6 hours' | Unhealthy | 'Class 12' | Yes | 7.0 | 1.0 | Yes | 0 |
| 27897 | 140686 | Male | 27.0 | Ludhiana | Student | 2.0 | 0.0 | 9.40 | 3.0 | 0.0 | 'Less than 5 hours' | Healthy | MSc | No | 0.0 | 3.0 | Yes | 0 |
| 27898 | 140689 | Male | 31.0 | Faridabad | Student | 3.0 | 0.0 | 6.61 | 4.0 | 0.0 | '5-6 hours' | Unhealthy | MD | No | 12.0 | 2.0 | No | 0 |
| 27899 | 140690 | Female | 18.0 | Ludhiana | Student | 5.0 | 0.0 | 6.88 | 2.0 | 0.0 | 'Less than 5 hours' | Healthy | 'Class 12' | Yes | 10.0 | 5.0 | No | 1 |
| 27900 | 140699 | Male | 27.0 | Patna | Student | 4.0 | 0.0 | 9.24 | 1.0 | 0.0 | 'Less than 5 hours' | Healthy | BCA | Yes | 2.0 | 3.0 | Yes | 1 |
Data Visualization¶
In [1]:
# Line plot of the mean Age at each Work Pressure level.
# The original call passed no x/y, so every numeric column (including `id`) was
# drawn against the row index while the axes were labelled "Work Pressure" and
# "Age". Aggregating by Work Pressure makes the plotted data match the labels.
# This cell needs `data`, so run the loading and cleaning cells first; on a
# fresh kernel it fails with NameError.
data.groupby("Work Pressure")["Age"].mean().plot(
    kind="line",
    title="students depression",
    xlabel="Work Pressure",
    ylabel="Mean Age",
)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[1], line 1 ----> 1 data.plot(kind = "line", title = "students depression", xlabel = "Work Pressure", ylabel = "Age") 2 data NameError: name 'data' is not defined
Scatter plot¶
In [41]:
# Scatter plot of Age (x axis) against Profession (y axis), one point per row.
data.plot.scatter(x="Age", y="Profession", title="students depression")
Out[41]:
<Axes: title={'center': 'students depression'}, xlabel='Age', ylabel='Profession'>
In [31]:
import matplotlib.pyplot as plt

# Scatter Job Satisfaction (x) against Work Pressure (y), drawing one series per
# gender so each gender gets its own colour and legend entry.
fig, ax = plt.subplots()
genders = data["Gender"].unique()
for gender in genders:
    gender_rows = data[data["Gender"] == gender]
    ax.scatter(
        gender_rows["Job Satisfaction"],
        gender_rows["Work Pressure"],
        label=gender,
    )
ax.set_xlabel("Job Satisfaction")
ax.set_ylabel("Work Pressure")
ax.legend()
plt.show()
In [34]:
# Bar graphs
In [32]:
import matplotlib.pyplot as plt

# Bar chart of the number of rows per gender.
gender_counts = data['Gender'].value_counts()
ax = gender_counts.plot.bar()
ax.set_title("Number of Participants by Gender")
ax.set_xlabel("Gender")
ax.set_ylabel("Count")
plt.show()
In [35]:
import matplotlib.pyplot as plt

# Bar chart of the number of rows per profession, with the x tick labels
# rotated 45 degrees.
ax = data['Profession'].value_counts().plot(kind='bar', rot=45)
ax.set(title='Profession Distribution', xlabel='Profession', ylabel='Count')
plt.show()
In [33]:
# Bar chart of the number of rows per profession on a 10x6 inch figure.
# NOTE(review): this plots the same Profession counts as the preceding bar-chart
# cell, only with a larger figure and a different title.
profession_count = data['Profession'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
profession_count.plot(kind='bar', ax=ax)
ax.set_title('Number of People by Profession')
ax.set_xlabel('Profession')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()
In [36]:
import pandas as pd
import matplotlib.pyplot as plt

# Bar chart of the mean of the Depression column for each Age value.
# NOTE(review): Depression appears to be a 0/1 flag (min 0, max 1 in the summary
# statistics), so each bar would be the share of rows flagged at that age -- confirm.
grouped = data.groupby("Age")["Depression"].mean()
ax = grouped.plot.bar()
ax.set(xlabel="Age", ylabel="Depression", title="Student Depression")
plt.show()
In [ ]:
In [ ]: