Week 1-Visualization of Datasets¶
In [1]:
import plotly.express as px
import plotly.io as pio # Required to define a renderer
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go # Used later for Sankey
In [ ]:
import pandas as pd
In [1]:
df = pd.read_excel("datasets/Large Numbers.xlsx")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[1], line 1 ----> 1 df = pd.read_excel("datasets/Large Numbers.xlsx") NameError: name 'pd' is not defined
In [13]:
print("--- First 5 Rows ---")
--- First 5 Rows ---
In [14]:
print(df.head())
Name Accuracy Time (total) Score (780) Score (%) \ 0 Abhishek Subba 0.691000 6951 748 0.958974 1 Abishek Adhikari 0.637108 4985 785 1.006410 2 Anjana Subba 0.820000 5311 846 1.084615 3 Arpan Rai 0.828077 5547 790 1.012821 4 Arpana Ghimirey 0.783438 4773 509 0.652564 Exercises started Trophies Easy Moderate Hard Last submission date 0 29 Gold 4 0 0 2025-10-22T14:53:12 1 30 Diamond 4 0 0 2025-08-18T11:21:05 2 33 Diamond 2 2 0 2025-09-10T13:22:29 3 29 Diamond 4 0 0 2025-08-09T18:04:17 4 21 Bronze 1 0 0 2025-10-22T12:40:02
In [21]:
sns.histplot(df['Trophies'], bins=50, kde=True, color='green')
Out[21]:
<Axes: xlabel='Trophies', ylabel='Count'>
In [33]:
import matplotlib.pyplot as plt
plt.hist(df["Trophies"], bins=10)
plt.xlabel("Trophies")
plt.ylabel("Number of Students")
plt.title("Distribution of Trophies")
plt.show()
In [39]:
plt.hist(df["Time (total)"], bins=10, color='orange', edgecolor='k')
plt.xlabel("Time (total)")
plt.ylabel("Number of Students")
plt.title("Distribution of Time (total)")
plt.show()
In [40]:
plt.scatter(df["Trophies"], df["Time (total)"], alpha=0.3)
plt.xlabel("Trophies")
plt.ylabel("Time (total)")
plt.title("Trophies vs Accuracy")
plt.show()
In [42]:
# Create a simple difficulty proxy: total non-easy questions
df["NonEasy"] = df["Moderate"] + df["Hard"]
# Or categorize:
df["Difficulty"] = pd.cut(df["NonEasy"],
bins=[-1, 0, 2, 10],
labels=["Easy-only", "Some Moderate", "Hard Attempted"])
df.boxplot(column="Accuracy", by="Difficulty", figsize=(7,5))
plt.title("Accuracy by Difficulty Attempted")
plt.suptitle("")
plt.xlabel("Difficulty Category")
plt.ylabel("Accuracy (%)")
plt.show()
Sankey Diagram¶
In [43]:
import pandas as pd
import plotly.graph_objects as go
# Load data (only needed columns)
df = pd.DataFrame([
[4,0,0],[4,0,0],[2,2,0],[4,0,0],[1,0,0],[1,0,0],[4,0,0],[4,0,0],
[2,3,0],[2,0,0],[4,0,0],[0,3,0],[1,0,0],[1,3,0],[0,2,0],[1,1,0],
[3,0,0],[3,1,1],[4,0,0],[3,1,0],[0,4,1],[4,0,0],[4,0,0],[0,3,0],
[1,2,0],[1,0,0],[2,0,0]
], columns=["Easy","Moderate","Hard"])
# Binary: did they attempt at least one?
df = (df > 0).astype(int)
# Count student flows
n_start = len(df)
n_easy = df["Easy"].sum()
n_mod_from_easy = ((df["Easy"] == 1) & (df["Moderate"] == 1)).sum()
n_hard_from_mod = ((df["Moderate"] == 1) & (df["Hard"] == 1)).sum()
n_mod_only = (df["Moderate"] & ~df["Easy"]).sum() # rare, but included
n_hard_only = (df["Hard"] & ~df["Moderate"]).sum()
# Nodes & links
labels = ["Start", "Easy", "Moderate", "Hard"]
source = [0, 0, 1, 2, 0] # Start→Easy, Start→Mod, Easy→Mod, Mod→Hard, Start→Hard
target = [1, 2, 2, 3, 3]
value = [n_easy, n_mod_only, n_mod_from_easy, n_hard_from_mod, n_hard_only]
fig = go.Figure(go.Sankey(node=dict(label=labels), link=dict(source=source, target=target, value=value)))
fig.update_layout(title="Student Flow by Difficulty Attempted", height=400)
fig.show()
In [44]:
import plotly.graph_objects as go
labels = ["Start", "Easy", "Moderate", "Hard"]
source = [0, 0, 1, 2] # Start→Easy, Start→Mod, Easy→Mod, Mod→Hard
target = [1, 2, 2, 3]
value = [22, 4, 6, 2]
fig = go.Figure(go.Sankey(
node=dict(label=labels, pad=15, thickness=20),
link=dict(source=source, target=target, value=value)
))
fig.update_layout(title="Student Difficulty Progression", height=400)
fig.show()
In [45]:
import plotly.graph_objects as go
# Trophy → [Easy, Moderate, Hard] totals (computed from your data)
bronze = [6, 6, 0] # 8 students
gold = [4, 0, 0] # 1 student
diamond = [66, 26, 2] # 18 students
# Nodes: 0=Start, 1=Bronze, 2=Gold, 3=Diamond, 4=Easy, 5=Moderate, 6=Hard
fig = go.Figure(go.Sankey(
node=dict(label=["Start","Bronze","Gold","Diamond","Easy","Mod","Hard"]),
link=dict(
source=[0,0,0, 1,2,3, 1,3, 3],
target=[1,2,3, 4,4,4, 5,5, 6],
value =[8,1,18, 6,4,66, 6,26, 2]
)
))
fig.update_layout(height=400, title="Exercise Flow by Trophy")
fig.show()
In [ ]: