import plotly.express as px
import plotly.io as pio # Required to define a renderer 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.graph_objects as go # Used later for Sankey

import pandas as pd

 # Read the results workbook into `df`, the DataFrame the following cells
 # preview and plot.
 df = pd.read_excel(io="datasets/Large Numbers.xlsx")

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[1], line 1
----> 1 df = pd.read_excel("datasets/Large Numbers.xlsx")

NameError: name 'pd' is not defined

# Heading printed ahead of the DataFrame preview.
preview_banner = "--- First 5 Rows ---"
print(preview_banner)

--- First 5 Rows ---

# Show the first five rows so the loaded columns can be checked by eye.
print(df.head(n=5))

               Name  Accuracy  Time (total)  Score (780)  Score (%)  \
0    Abhishek Subba  0.691000          6951          748   0.958974   
1  Abishek Adhikari  0.637108          4985          785   1.006410   
2      Anjana Subba  0.820000          5311          846   1.084615   
3         Arpan Rai  0.828077          5547          790   1.012821   
4   Arpana Ghimirey  0.783438          4773          509   0.652564   

   Exercises started Trophies  Easy  Moderate  Hard Last submission date  
0                 29     Gold     4         0     0  2025-10-22T14:53:12  
1                 30  Diamond     4         0     0  2025-08-18T11:21:05  
2                 33  Diamond     2         2     0  2025-09-10T13:22:29  
3                 29  Diamond     4         0     0  2025-08-09T18:04:17  
4                 21   Bronze     1         0     0  2025-10-22T12:40:02

 # Count of students per trophy tier.
 # The preview of `df` shows Trophies as text labels (Gold, Diamond, Bronze),
 # so seaborn draws one bar per label: a bin count has no effect there and a
 # KDE curve over label positions is not meaningful, hence neither is passed.
 sns.histplot(df['Trophies'], color='green')

<Axes: xlabel='Trophies', ylabel='Count'>

import matplotlib.pyplot as plt

# Histogram of the Trophies column, drawn on the current axes.
# NOTE(review): the preview shows Trophies as text labels (Gold, Diamond,
# Bronze); 10 bins over a label column may leave empty gaps, and a bar chart
# of value counts might read better - confirm.
trophy_ax = plt.gca()
trophy_ax.hist(df["Trophies"], bins=10)
trophy_ax.set_xlabel("Trophies")
trophy_ax.set_ylabel("Number of Students")
trophy_ax.set_title("Distribution of Trophies")
plt.show()

# Histogram of total time per student: 10 bins, orange bars with black edges,
# drawn on the current axes.
time_ax = plt.gca()
time_ax.hist(df["Time (total)"], bins=10, color='orange', edgecolor='k')
time_ax.set_xlabel("Time (total)")
time_ax.set_ylabel("Number of Students")
time_ax.set_title("Distribution of Time (total)")
plt.show()

# Scatter of trophy tier (x) against total time (y); alpha=0.3 keeps
# overlapping points distinguishable.
plt.scatter(df["Trophies"], df["Time (total)"], alpha=0.3)
plt.xlabel("Trophies")
plt.ylabel("Time (total)")
# The title names the column that is actually plotted on the y axis
# (it previously said "Accuracy" while the data was "Time (total)").
plt.title("Trophies vs Time (total)")
plt.show()

# Simple difficulty proxy kept as its own column: total non-easy questions.
df["NonEasy"] = df["Moderate"] + df["Hard"]

# Categorise each student by the hardest tier they actually attempted.
# Binning the NonEasy total instead would label a student with three Moderate
# and no Hard questions as "Hard Attempted", label one Moderate plus one Hard
# as "Some Moderate", and drop any total above the last bin edge to NaN.
difficulty_order = ["Easy-only", "Some Moderate", "Hard Attempted"]
tier_code = (df["Moderate"] > 0).astype(int)   # 1 -> at least one Moderate
tier_code[df["Hard"] > 0] = 2                  # 2 -> at least one Hard
tier_code[df["NonEasy"].isna()] = -1           # -1 -> missing counts stay NaN
df["Difficulty"] = pd.Categorical.from_codes(
    tier_code,
    categories=difficulty_order,
    ordered=True,
)

# One box of Accuracy per difficulty category.
df.boxplot(column="Accuracy", by="Difficulty", figsize=(7,5))
plt.title("Accuracy by Difficulty Attempted")
plt.suptitle("")  # blank out the automatic "grouped by" super-title
plt.xlabel("Difficulty Category")
# NOTE(review): the previewed Accuracy values are fractions (e.g. 0.69), so
# the "(%)" unit in this label may be misleading - confirm before changing.
plt.ylabel("Accuracy (%)")
plt.show()

import pandas as pd
import plotly.graph_objects as go

# Per-student attempt counts for the three tiers, written inline rather than
# read from the workbook (columns: Easy, Moderate, Hard).
# NOTE: this rebinds `df`, so the DataFrame loaded from Excel is no longer
# reachable under that name after this cell.
df = pd.DataFrame([
    [4,0,0],[4,0,0],[2,2,0],[4,0,0],[1,0,0],[1,0,0],[4,0,0],[4,0,0],
    [2,3,0],[2,0,0],[4,0,0],[0,3,0],[1,0,0],[1,3,0],[0,2,0],[1,1,0],
    [3,0,0],[3,1,1],[4,0,0],[3,1,0],[0,4,1],[4,0,0],[4,0,0],[0,3,0],
    [1,2,0],[1,0,0],[2,0,0]
], columns=["Easy","Moderate","Hard"])

# Binary: did they attempt at least one?
df = (df > 0).astype(int)

# Explicit boolean masks. Applying `~` directly to the 0/1 integer columns is
# a two's-complement flip (~1 == -2), which only produces the intended counts
# by coincidence; negating a boolean mask is a true logical NOT.
did_easy = df["Easy"] == 1
did_mod = df["Moderate"] == 1
did_hard = df["Hard"] == 1

# Count student flows
n_start = len(df)
n_easy = did_easy.sum()
n_mod_from_easy = (did_easy & did_mod).sum()
n_hard_from_mod = (did_mod & did_hard).sum()
n_mod_only = (did_mod & ~did_easy).sum()    # Moderate without Easy
n_hard_only = (did_hard & ~did_mod).sum()   # Hard without Moderate

# Nodes & links: indices in `source`/`target` refer to positions in `labels`.
labels = ["Start", "Easy", "Moderate", "Hard"]
source = [0, 0, 1, 2, 0]          # Start→Easy, Start→Mod, Easy→Mod, Mod→Hard, Start→Hard
target = [1, 2, 2, 3, 3]
value  = [n_easy, n_mod_only, n_mod_from_easy, n_hard_from_mod, n_hard_only]

fig = go.Figure(go.Sankey(node=dict(label=labels), link=dict(source=source, target=target, value=value)))
fig.update_layout(title="Student Flow by Difficulty Attempted", height=400)
fig.show()

import plotly.graph_objects as go

# Sankey of student progression through the difficulty tiers, using
# hand-entered link sizes. Each tuple is (source node, target node, size);
# node indices refer to positions in `labels`.
# NOTE(review): counting the 27-row Easy/Moderate/Hard table defined earlier
# in this file gives 23, 4, 7, 2 for these four links, not 22, 4, 6, 2 -
# confirm which figures are intended.
flows = [
    (0, 1, 22),  # Start → Easy
    (0, 2, 4),   # Start → Moderate
    (1, 2, 6),   # Easy → Moderate
    (2, 3, 2),   # Moderate → Hard
]

labels = ["Start", "Easy", "Moderate", "Hard"]
source = [src for src, _, _ in flows]
target = [dst for _, dst, _ in flows]
value = [size for _, _, size in flows]

sankey_trace = go.Sankey(
    node={"label": labels, "pad": 15, "thickness": 20},
    link={"source": source, "target": target, "value": value},
)
fig = go.Figure(data=sankey_trace)
fig.update_layout(title="Student Difficulty Progression", height=400)
fig.show()

import plotly.graph_objects as go

# Per-trophy totals in the order [Easy, Moderate, Hard]; the trailing comments
# give the number of students holding each trophy.
# NOTE(review): the Start→trophy links below are student counts while the
# trophy→tier links are these totals, so inflow and outflow of a trophy node
# are in different units - confirm that this is intended.
# NOTE(review): the Easy/Moderate totals here (76 / 32) differ from the column
# sums of the 27-row table defined earlier in this file (60 / 25) - confirm
# the data source.
bronze = [6, 6, 0]     # 8 students
gold = [4, 0, 0]       # 1 student
diamond = [66, 26, 2]  # 18 students

# Node indices: 0=Start, 1=Bronze, 2=Gold, 3=Diamond, 4=Easy, 5=Moderate, 6=Hard
node_names = ["Start","Bronze","Gold","Diamond","Easy","Mod","Hard"]

# (source node, target node, size); links whose size would be zero are omitted.
trophy_flows = [
    (0, 1, 8),
    (0, 2, 1),
    (0, 3, 18),
    (1, 4, bronze[0]),
    (2, 4, gold[0]),
    (3, 4, diamond[0]),
    (1, 5, bronze[1]),
    (3, 5, diamond[1]),
    (3, 6, diamond[2]),
]

trophy_trace = go.Sankey(
    node={"label": node_names},
    link={
        "source": [src for src, _, _ in trophy_flows],
        "target": [dst for _, dst, _ in trophy_flows],
        "value": [size for _, _, size in trophy_flows],
    },
)
fig = go.Figure(data=trophy_trace)
fig.update_layout(height=400, title="Exercise Flow by Trophy")
fig.show()

Week 1 - Visualization of Datasets

Sankey Diagram