import plotly.express as px
import plotly.io as pio # Required to define a renderer 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.graph_objects as go # Used later for Sankey

# Load the dataset from a path relative to the current working directory.
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"
df = pd.read_csv(file_path)

# Preview the first rows. This only renders when it is the last expression of
# a notebook cell; as a plain script statement it has no visible effect.
df.head()

# Interactive scatter of motivation_level against exam_score, with one colour
# per family_income_range value.
fig = px.scatter(df, x="motivation_level", y="exam_score", color="family_income_range")
fig.show()

# Histogram of screen_time (20 bins) with a KDE curve overlaid.
# The statement previously started with a stray space; at top level that is an
# IndentationError when this file is executed as a script.
sns.histplot(df['screen_time'], bins=20, kde=True, color='skyblue')

<Axes: xlabel='screen_time', ylabel='Count'>

import matplotlib.pyplot as plt

# Histogram of study_hours_per_day in 30 bins, drawn on the current axes.
plt.hist(x=df["study_hours_per_day"], bins=30)

# Label the same (current) axes in a single call.
plt.gca().set(
    xlabel="Study Hours Per Day",
    ylabel="Number of Students",
    title="Distribution of Daily Study Hours",
)
plt.show()

# One bar per distinct value of the "major" column, ordered by frequency
# (value_counts sorts descending by default).
df["major"].value_counts().plot.bar()

# Annotate the axes the bars were just drawn on.
plt.gca().set(
    xlabel="Major",
    ylabel="Number of Students",
    title="Distribution of Majors in Dataset",
)
plt.show()

# Scatter of study hours against exam score; alpha=0.3 makes the markers
# translucent so overlapping points remain distinguishable.
plt.scatter(x=df["study_hours_per_day"], y=df["exam_score"], alpha=0.3)

plt.gca().set(
    xlabel="Study Hours Per Day",
    ylabel="Exam Score",
    title="Study Time vs Exam Performance",
)
plt.show()

# Box plot of previous_gpa with one box per gender value, on a 7x5 inch figure.
df.boxplot(column="previous_gpa", by="gender", figsize=(7, 5))

# Blank out the figure-level heading so only the axes title below is shown
# (pandas adds its own heading to grouped box plots).
plt.gcf().suptitle("")

plt.gca().set(
    title="GPA by Gender",
    xlabel="Gender",
    ylabel="Previous GPA",
)
plt.show()

# Create categories: bucket each numeric column into Low / Medium / High.
#
# pd.cut builds right-closed intervals by default, i.e. (0, 3], (3, 7], (7, 10].
# Without include_lowest=True the first interval is open on the left, so a
# value exactly equal to the lowest edge (0) would silently become NaN.
# Values outside the outer edges still become NaN.
# NOTE(review): the edges assume ratings on a 0-10 scale and scores on 0-100;
# confirm against the dataset description.
df["mh_cat"] = pd.cut(
    df["mental_health_rating"],
    bins=[0, 3, 7, 10],
    labels=["Low", "Medium", "High"],
    include_lowest=True,
)
df["stress_cat"] = pd.cut(
    df["stress_level"],
    bins=[0, 3, 7, 10],
    labels=["Low", "Medium", "High"],
    include_lowest=True,
)
df["score_cat"] = pd.cut(
    df["exam_score"],
    bins=[0, 50, 75, 100],
    labels=["Low", "Medium", "High"],
    include_lowest=True,
)

# Preview the frame with the three new columns (renders only as the last
# expression of a notebook cell).
df.head()

# Band screen_time into the ranges named by `labels`.
#
# The labels describe right-closed ranges ("0-2", "2.1-4", ...), so the bins
# must be right-closed as well. With right=False a value of exactly 8.0 was
# filed under "8.1-10" and 10.0 under "10.1+".
# include_lowest=True keeps a value of exactly 0 in the first band.
# The last edge is infinity so that "10.1+" really is open-ended; with an
# upper edge of 12, values of 12 or more became NaN.
bins = [0, 2, 4, 6, 8, 10, float("inf")]
labels = ['0-2', '2.1-4', '4.1-6', '6.1-8', '8.1-10', '10.1+']
df['Screen time'] = pd.cut(
    df['screen_time'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

# Row count for every (parental education, family income, dropout risk)
# combination that occurs in the data, stored in a "count" column.
# NOTE(review): the hard-coded link values of the static Sankey further down
# do not read this frame.
flow_df = (
    df.groupby(["parental_education_level", "family_income_range", "dropout_risk"])
    .size()
    .reset_index(name="count")
)

# Node labels for a three-stage diagram (education -> income -> risk), three
# levels per stage. A label's position in the list is its node id:
#   0-2 education, 3-5 income, 6-8 risk.
labels = [
    f"{level} {stage}"
    for stage in ("Edu", "Income", "Risk")
    for level in ("Low", "Medium", "High")
]

# Node styling: one colour per stage (reddish, greenish, bluish), repeated for
# the three nodes of that stage.
nodes = {
    "label": labels,
    "color": [
        f"rgba({rgb}, 0.8)"
        for rgb in ("255, 99, 71", "60, 179, 113", "65, 105, 225")
        for _ in range(3)
    ],
}

# Links are parallel lists: entry i connects source[i] -> target[i] with
# weight value[i]. The first nine entries are stage 1 -> stage 2, the last
# nine are stage 2 -> stage 3. The values are fixed sample numbers.
links = {
    # Each of the six non-terminal nodes (0-5) fans out to three targets.
    "source": [node for node in range(6) for _ in range(3)],
    # Stage 1 nodes feed nodes 3-5; stage 2 nodes feed nodes 6-8.
    "target": [3, 4, 5] * 3 + [6, 7, 8] * 3,
    "value": [
        10, 5, 2, 5, 15, 10, 2, 8, 25,   # stage 1 -> stage 2
        5, 8, 2, 10, 10, 10, 15, 5, 5,   # stage 2 -> stage 3
    ],
    # Each link takes the (more transparent) colour of its source stage.
    "color": ["rgba(255, 99, 71, 0.5)"] * 9 + ["rgba(60, 179, 113, 0.5)"] * 9,
}

# Build the figure around a single Sankey trace that uses the `nodes` and
# `links` dicts. Values are formatted without decimals and suffixed with
# " students".
fig = go.Figure()
fig.add_trace(
    go.Sankey(
        node=nodes,
        link=links,
        arrangement="snap",
        valueformat=".0f",
        valuesuffix=" students",
    )
)

# Figure title and global font size.
fig.update_layout(
    title={"text": "Sequential Flow: Parental Education → Family Income → Dropout Risk"},
    font={"size": 10},
)

# Preview the data frame (renders only as the last expression of a notebook cell).
df.head()

# Columns that form the three Sankey stages, left to right.
start_point, mid_point, last_point = 'learning_style', 'major', 'mh_cat'

# Number of rows for each combination of the three stage columns.
# observed=True limits categorical group keys to combinations present in the data.
flow_df = (
    df.groupby([start_point, mid_point, last_point], observed=True)
    .size()
    .reset_index(name='count')
)

# Distinct node names in first-seen order: stage 1 values, then stage 2, then
# stage 3. A name occurring in more than one stage appears only once.
all_nodes = (
    pd.concat([flow_df[column] for column in (start_point, mid_point, last_point)])
    .unique()
    .tolist()
)

# Node name -> integer node id (its position in all_nodes).
node_index = dict(zip(all_nodes, range(len(all_nodes))))
print(node_index)

{'Auditory': 0, 'Kinesthetic': 1, 'Reading': 2, 'Visual': 3, 'Arts': 4, 'Biology': 5, 'Business': 6, 'Computer Science': 7, 'Engineering': 8, 'Psychology': 9, 'Low': 10, 'Medium': 11, 'High': 12}

# Stage 1 -> stage 2: translate both endpoint columns to node ids, one link
# per row of flow_df, weighted by that row's count.
source_1, target_1 = (
    flow_df[column].map(node_index) for column in (start_point, mid_point)
)
value_1 = flow_df['count']

# Stage 2 -> stage 3: same rows and the same counts, next pair of columns.
source_2, target_2 = (
    flow_df[column].map(node_index) for column in (mid_point, last_point)
)
value_2 = flow_df['count']

# Node styling for the data-driven Sankey.
#
# Colours are derived from the stage (column) each node belongs to rather than
# from a fixed nine-entry list. The fixed list assumed three nodes per stage,
# which did not match all_nodes: colours were assigned to the wrong stage and
# the trailing nodes had no colour at all.
_stage_colors = (
    (start_point, "rgba(255, 99, 71, 0.8)"),   # Stage 1 (reddish)
    (mid_point, "rgba(60, 179, 113, 0.8)"),    # Stage 2 (greenish)
    (last_point, "rgba(65, 105, 225, 0.8)"),   # Stage 3 (bluish)
)

_color_by_node = {}
for _column, _rgba in _stage_colors:
    for _name in flow_df[_column].unique():
        # all_nodes de-duplicates names across stages; keep the colour of the
        # earliest stage a name appears in.
        _color_by_node.setdefault(_name, _rgba)

nodes = dict(
    label=all_nodes,
    # One colour per label, in the same order as all_nodes.
    color=[_color_by_node[name] for name in all_nodes],
)

# Concatenate the stage 1 -> 2 and stage 2 -> 3 link arrays into the flat
# lists expected by the Sankey trace (stage 1 links first, then stage 2).
#
# The series are iterated directly instead of being read as `series[i]`
# inside `range(len(...))` loops: `series[i]` is a label lookup and is only
# correct while the series carries the default 0..n-1 index. The `ind` counter
# maintained by the old loops was never read and has been dropped.
final_source = [*source_1, *source_2]
final_target = [*target_1, *target_2]
final_value = [*value_1, *value_2]

# Link definition for the Sankey trace. The three lists are parallel:
# position i describes one link from source[i] to target[i] with weight
# value[i]. Each list holds the stage 1 -> 2 links followed by the
# stage 2 -> 3 links.
links = {
    "source": final_source,
    "target": final_target,
    "value": final_value,
}

# Build the figure around a single Sankey trace that uses the `nodes` and
# `links` dicts. Values are formatted without decimals and suffixed with
# " students".
fig = go.Figure()
fig.add_trace(
    go.Sankey(
        node=nodes,
        link=links,
        arrangement="snap",
        valueformat=".0f",
        valuesuffix=" students",
    )
)

# Title names the three stage columns, separated by spaced-out arrows.
fig.update_layout(
    title={
        "text": "".join(
            (
                start_point,
                "              -->            ",
                mid_point,
                "           -->            ",
                last_point,
            )
        )
    },
    font={"size": 12},
)

	student_id	age	gender	major	study_hours_per_day	social_media_hours	netflix_hours	part_time_job	attendance_percentage	sleep_hours	...	screen_time	study_environment	access_to_tutoring	family_income_range	parental_support_level	motivation_level	exam_anxiety_score	learning_style	time_management_score	exam_score
0	100000	26	Male	Computer Science	7.645367	3.0	0.1	Yes	70.3	6.2	...	10.9	Co-Learning Group	Yes	High	9	7	8	Reading	3.0	100
1	100001	28	Male	Arts	5.700000	0.5	0.4	No	88.4	7.2	...	8.3	Co-Learning Group	Yes	Low	7	2	10	Reading	6.0	99
2	100002	17	Male	Arts	2.400000	4.2	0.7	No	82.1	9.2	...	8.0	Library	Yes	High	3	9	6	Kinesthetic	7.6	98
3	100003	27	Other	Psychology	3.400000	4.6	2.3	Yes	79.3	4.2	...	11.7	Co-Learning Group	Yes	Low	5	3	10	Reading	3.2	100
4	100004	25	Female	Business	4.700000	0.8	2.7	Yes	62.9	6.5	...	9.4	Quiet Room	Yes	Medium	9	1	10	Reading	7.1	98

	student_id	age	gender	major	study_hours_per_day	social_media_hours	netflix_hours	part_time_job	attendance_percentage	sleep_hours	...	family_income_range	parental_support_level	motivation_level	exam_anxiety_score	learning_style	time_management_score	exam_score	mh_cat	stress_cat	score_cat
0	100000	26	Male	Computer Science	7.645367	3.0	0.1	Yes	70.3	6.2	...	High	9	7	8	Reading	3.0	100	Medium	Medium	High
1	100001	28	Male	Arts	5.700000	0.5	0.4	No	88.4	7.2	...	Low	7	2	10	Reading	6.0	99	Medium	Medium	High
2	100002	17	Male	Arts	2.400000	4.2	0.7	No	82.1	9.2	...	High	3	9	6	Kinesthetic	7.6	98	Medium	High	High
3	100003	27	Other	Psychology	3.400000	4.6	2.3	Yes	79.3	4.2	...	Low	5	3	10	Reading	3.2	100	High	Medium	High
4	100004	25	Female	Business	4.700000	0.8	2.7	Yes	62.9	6.5	...	Medium	9	1	10	Reading	7.1	98	High	Medium	High

	student_id	age	gender	major	study_hours_per_day	social_media_hours	netflix_hours	part_time_job	attendance_percentage	sleep_hours	...	parental_support_level	motivation_level	exam_anxiety_score	learning_style	time_management_score	exam_score	mh_cat	stress_cat	score_cat	Screen time
0	100000	26	Male	Computer Science	7.645367	3.0	0.1	Yes	70.3	6.2	...	9	7	8	Reading	3.0	100	Medium	Medium	High	10.1+
1	100001	28	Male	Arts	5.700000	0.5	0.4	No	88.4	7.2	...	7	2	10	Reading	6.0	99	Medium	Medium	High	8.1-10
2	100002	17	Male	Arts	2.400000	4.2	0.7	No	82.1	9.2	...	3	9	6	Kinesthetic	7.6	98	Medium	High	High	8.1-10
3	100003	27	Other	Psychology	3.400000	4.6	2.3	Yes	79.3	4.2	...	5	3	10	Reading	3.2	100	High	Medium	High	10.1+
4	100004	25	Female	Business	4.700000	0.8	2.7	Yes	62.9	6.5	...	9	1	10	Reading	7.1	98	High	Medium	High	8.1-10

Week 1 - 2nd Assignment¶

Imported all the required modules¶

Histogram Plot¶

Sankey Diagram¶

Mapping to Sankey source, target, value¶

Sankey Output¶

Jupyter Page Unresponsive¶