Probability Distribution¶
Investigating my dataset’s probability distribution means examining the data closely to understand its shape and patterns. This helps me choose a suitable mathematical model and decide whether basic measures such as the mean and variability are enough, or whether more advanced techniques (for example, a mixture model) are needed.
In [2]:
import pandas as pd
df = pd.read_excel("datasets/Large Numbers.xlsx") # load dataset
print(df.head())
               Name  Accuracy  Time (total)  Score (780)  Score (%)  \
0    Abhishek Subba  0.691000          6951          748   0.958974
1  Abishek Adhikari  0.637108          4985          785   1.006410
2      Anjana Subba  0.820000          5311          846   1.084615
3         Arpan Rai  0.828077          5547          790   1.012821
4   Arpana Ghimirey  0.783438          4773          509   0.652564

   Exercises started Trophies  Easy  Moderate  Hard Last submission date
0                 29     Gold     4         0     0  2025-10-22T14:53:12
1                 30  Diamond     4         0     0  2025-08-18T11:21:05
2                 33  Diamond     2         2     0  2025-09-10T13:22:29
3                 29  Diamond     4         0     0  2025-08-09T18:04:17
4                 21   Bronze     1         0     0  2025-10-22T12:40:02
In [3]:
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Name                  28 non-null     object
 1   Accuracy              28 non-null     float64
 2   Time (total)          28 non-null     int64
 3   Score (780)           28 non-null     int64
 4   Score (%)             28 non-null     float64
 5   Exercises started     28 non-null     int64
 6   Trophies              28 non-null     object
 7   Easy                  28 non-null     int64
 8   Moderate              28 non-null     int64
 9   Hard                  28 non-null     int64
 10  Last submission date  27 non-null     object
dtypes: float64(2), int64(6), object(3)
memory usage: 2.5+ KB
None
In [4]:
print(df.describe())
Accuracy Time (total) Score (780) Score (%) Exercises started \
count 28.000000 28.000000 28.000000 28.000000 28.000000
mean 0.723004 6791.785714 679.678571 0.871383 27.000000
std 0.159138 3536.149186 197.897290 0.253714 6.738502
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.700578 4932.000000 499.750000 0.640705 22.750000
50% 0.756880 5519.000000 784.000000 1.005128 30.000000
75% 0.804888 8148.250000 791.000000 1.014103 31.000000
max 0.857273 16432.000000 846.000000 1.084615 33.000000
Easy Moderate Hard
count 28.000000 28.000000 28.000000
mean 2.142857 0.892857 0.071429
std 1.556689 1.286375 0.262265
min 0.000000 0.000000 0.000000
25% 1.000000 0.000000 0.000000
50% 2.000000 0.000000 0.000000
75% 4.000000 2.000000 0.000000
max 4.000000 4.000000 1.000000
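The summary statistics above raise the question from the introduction: is a mean-and-variability description enough on its own? A minimal follow-up sketch, assuming the df loaded from the Excel file above is still in memory and that scipy is installed, quantifies the shape of the Score (%) column with skewness and excess kurtosis:

from scipy import stats
score = df["Score (%)"].dropna()
print("Skewness:", round(stats.skew(score), 3))             # negative values suggest a left-skewed distribution
print("Excess kurtosis:", round(stats.kurtosis(score), 3))  # positive values suggest heavier tails than a normal

Clearly non-zero values on either measure hint that a single Gaussian summary may not describe the scores well.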
In [5]:
import pandas as pd
# Parse the table from Large Numbers.pdf
data = [
["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]
df = pd.DataFrame(data, columns=[
"Name", "Accuracy", "Time (total)", "Score (780)", "Score (%)",
"Exercises started", "Trophies", "Easy", "Moderate", "Hard", "Last submission date"
])
# Clean 'Score (%)' (remove '%', convert to float)
df["Score (%)"] = df["Score (%)"].str.rstrip('%').astype(float)
# Compute mean and std
mean_score = df["Score (%)"].mean()
std_score = df["Score (%)"].std()
print("Mean score (%):", round(mean_score, 2))
print("Std score (%):", round(std_score, 2))
Mean score (%): 90.37
Std score (%): 19.12
In [6]:
import pandas as pd
import matplotlib.pyplot as plt
# Parse the markdown table from Large Numbers.pdf
data = [
["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]
# Create DataFrame
df = pd.DataFrame(data, columns=[
"Name", "Accuracy", "Time (total)", "Score (780)", "Score (%)",
"Exercises started", "Trophies", "Easy", "Moderate", "Hard", "Last submission date"
])
# Clean the Score (%) column (remove % and convert to float)
df['Score (%)'] = df['Score (%)'].str.rstrip('%').astype(float)
# Create histogram
plt.figure(figsize=(10, 6))
plt.hist(df['Score (%)'], bins=15, density=True, alpha=0.6, color='skyblue', edgecolor='black')
plt.title("Histogram of Student Scores (%)", fontsize=14)
plt.xlabel("Score (%)", fontsize=12)
plt.ylabel("Density", fontsize=12)
plt.grid(axis='y', alpha=0.75, linestyle='--')
plt.show()
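To judge visually whether one normal distribution could describe the scores, a fitted normal PDF can be overlaid on the same histogram. This is only a supplementary sketch, assuming the cleaned df from the cell above is still in memory:

import numpy as np
from scipy.stats import norm
scores = df['Score (%)'].values
mu, sigma = scores.mean(), scores.std(ddof=1)   # sample mean and standard deviation
plt.figure(figsize=(10, 6))
plt.hist(scores, bins=15, density=True, alpha=0.6, color='skyblue', edgecolor='black')
xs = np.linspace(scores.min() - 5, scores.max() + 5, 200)
plt.plot(xs, norm.pdf(xs, mu, sigma), color='darkred', linewidth=2, label='Fitted normal PDF')
plt.title("Histogram of Student Scores (%) with Fitted Normal PDF")
plt.xlabel("Score (%)")
plt.ylabel("Density")
plt.legend()
plt.show()

A clear mismatch between the bars and the curve (for example, two separate bumps) points towards a mixture model rather than a single Gaussian.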
In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Parse the markdown table from Large Numbers.pdf
data = [
["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]
# Create DataFrame
df = pd.DataFrame(data, columns=[
"Name", "Accuracy", "Time (total)", "Score (780)", "Score (%)",
"Exercises started", "Trophies", "Easy", "Moderate", "Hard", "Last submission date"
])
# Clean the Score (%) column (remove % and convert to float)
df['Score (%)'] = df['Score (%)'].str.rstrip('%').astype(float)
# Create KDE plot
plt.figure(figsize=(10, 6))
sns.kdeplot(df['Score (%)'], fill=True, color="red", alpha=0.5)
plt.title("Kernel Density Estimate of Student Scores (%)", fontsize=14)
plt.xlabel("Score (%)", fontsize=12)
plt.ylabel("Density", fontsize=12)
plt.grid(axis='y', alpha=0.3, linestyle='--')
plt.show()
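KDE results depend heavily on the bandwidth: a curve that looks bimodal at one setting can look unimodal at another. A small sketch, assuming seaborn 0.11+ (for the bw_adjust parameter) and the df from the cell above, compares a few bandwidth scalings:

plt.figure(figsize=(10, 6))
for bw in (0.5, 1.0, 2.0):
    sns.kdeplot(df['Score (%)'], bw_adjust=bw, label=f"bw_adjust={bw}")
plt.title("KDE of Student Scores (%) at Different Bandwidths")
plt.xlabel("Score (%)")
plt.ylabel("Density")
plt.legend()
plt.show()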
In [9]:
import pandas as pd
# Parse the table from Large Numbers.pdf
data = [
["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]
# Create DataFrame
df = pd.DataFrame(data, columns=[
"Name", "Accuracy", "Time (total)", "Score (780)", "Score (%)",
"Exercises started", "Trophies", "Easy", "Moderate", "Hard", "Last submission date"
])
# Clean percentage columns (remove % and convert to float)
df["Accuracy"] = df["Accuracy"].str.rstrip('%').astype(float)
df["Score (%)"] = df["Score (%)"].str.rstrip('%').astype(float)
# Calculate covariance matrix between Accuracy and Score (%)
cov_matrix = df[['Accuracy', 'Score (%)']].cov()
print("Covariance matrix between Accuracy and Score (%):")
print(cov_matrix)
Covariance matrix between Accuracy and Score (%):
Accuracy Score (%)
Accuracy 54.493741 -14.938351
Score (%) -14.938351 365.589487
In [8]:
import pandas as pd
# Parse the table from Large Numbers.pdf
data = [
["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]
# Create DataFrame
df = pd.DataFrame(data, columns=[
"Name", "Accuracy", "Time (total)", "Score (780)", "Score (%)",
"Exercises started", "Trophies", "Easy", "Moderate", "Hard", "Last submission date"
])
# Clean percentage columns (remove % and convert to float)
df["Accuracy"] = df["Accuracy"].str.rstrip('%').astype(float)
df["Score (%)"] = df["Score (%)"].str.rstrip('%').astype(float)
# Calculate covariance between Accuracy and Score (%)
cov_value = df['Accuracy'].cov(df['Score (%)'])
print("Covariance between Accuracy and Score (%):", cov_value)
Covariance between Accuracy and Score (%): -14.938350854700879
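Covariance is expressed in the product of the two variables' units, so its magnitude is hard to interpret on its own. Scaling it by both standard deviations gives the Pearson correlation; a one-line check, assuming the cleaned df from the cell above:

corr = df['Accuracy'].corr(df['Score (%)'])   # Pearson correlation = cov(X, Y) / (std(X) * std(Y))
print("Pearson correlation between Accuracy and Score (%):", round(corr, 3))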
In [10]:
import pandas as pd
import matplotlib.pyplot as plt
# Parse data from Large Numbers.pdf
data = [
["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]
# Create DataFrame
df = pd.DataFrame(data, columns=[
"Name", "Accuracy", "Time (total)", "Score (780)", "Score (%)",
"Exercises started", "Trophies", "Easy", "Moderate", "Hard", "Last submission date"
])
# Clean numeric columns
df["Accuracy"] = df["Accuracy"].str.rstrip('%').astype(float)
df["Score (%)"] = df["Score (%)"].str.rstrip('%').astype(float)
# Scatter plot: Accuracy vs Score (%)
plt.figure(figsize=(8, 6))
plt.scatter(df['Accuracy'], df['Score (%)'], alpha=0.6, color='steelblue', s=60, edgecolor='k')
plt.title("Scatter Plot: Accuracy vs Score (%)", fontsize=14)
plt.xlabel("Accuracy (%)", fontsize=12)
plt.ylabel("Score (%)", fontsize=12)
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import Voronoi, voronoi_plot_2d
import time
from matplotlib.colors import ListedColormap
# Parse data from Large Numbers.pdf
data = [
["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]
# Create DataFrame
df = pd.DataFrame(data, columns=[
"Name", "Accuracy", "Time (total)", "Score (780)", "Score (%)",
"Exercises started", "Trophies", "Easy", "Moderate", "Hard", "Last submission date"
])
# Clean percentage columns
df["Accuracy"] = df["Accuracy"].str.rstrip('%').astype(float)
df["Score (%)"] = df["Score (%)"].str.rstrip('%').astype(float)
# Start timing
start_time = time.time()
# Prepare data for Voronoi diagram - using Accuracy and Score (%)
points = df[["Accuracy", "Score (%)"]].values
# Create Voronoi diagram
vor = Voronoi(points)
# End timing
end_time = time.time()
computation_time = end_time - start_time
print(f"Voronoi computation time: {computation_time:.6f} seconds")
# Create figure with appropriate size
fig = plt.figure(figsize=(12, 10))
# Plot Voronoi diagram
voronoi_plot_2d(vor, ax=plt.gca(), show_vertices=False, line_colors='darkblue',
line_width=1, point_size=15, line_alpha=0.8)
# Color regions by trophy type
trophy_colors = {'Bronze': '#CD7F32', 'Gold': '#FFD700', 'Diamond': '#B9F2FF'}
for i, region_index in enumerate(vor.point_region):
    region = vor.regions[region_index]
    if -1 not in region and len(region) > 0:  # Skip infinite (unbounded) regions
        polygon = [vor.vertices[v] for v in region]  # separate index avoids shadowing the outer loop variable i
        trophy = df.iloc[i]["Trophies"]
        plt.fill(*zip(*polygon), color=trophy_colors[trophy], alpha=0.3)
# Plot the original points with colored markers
for trophy, color in trophy_colors.items():
    mask = df["Trophies"] == trophy
    plt.scatter(df[mask]["Accuracy"], df[mask]["Score (%)"],
                c=color, s=100, label=trophy, edgecolors='k', linewidth=1.5)
# Add labels for some notable points
notable_students = ["Yeshey Tshoki", "Tenzin Sonam Dolkar", "Najimul Mia", "Kuenga Rinchen"]
for student in notable_students:
    idx = df[df["Name"] == student].index[0]
    plt.annotate(student, (df.iloc[idx]["Accuracy"], df.iloc[idx]["Score (%)"]),
                 xytext=(5, 5), textcoords='offset points', fontsize=9)
# Customize the plot
plt.title("Voronoi Diagram: Student Performance Regions\n(Accuracy vs Score %)", fontsize=16, fontweight='bold')
plt.xlabel("Accuracy (%)", fontsize=12)
plt.ylabel("Score (%)", fontsize=12)
plt.legend(title="Trophies", fontsize=10)
plt.grid(True, linestyle='--', alpha=0.7)
# Set axis limits with padding
x_min, x_max = df["Accuracy"].min() - 5, df["Accuracy"].max() + 5
y_min, y_max = df["Score (%)"].min() - 5, df["Score (%)"].max() + 5
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.tight_layout()
plt.savefig("student_performance_voronoi.png", dpi=300, bbox_inches='tight')
plt.show()
print("Voronoi diagram saved as 'student_performance_voronoi.png'")
Voronoi computation time: 0.001308 seconds
Voronoi diagram saved as 'student_performance_voronoi.png'
In [12]:
df.head()
Out[12]:
| | Name | Accuracy | Time (total) | Score (780) | Score (%) | Exercises started | Trophies | Easy | Moderate | Hard | Last submission date |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Abhishek Subba | 69.10 | 6951 | 748 | 95.90 | 29 | Gold | 4 | 0 | 0 | 2025-10-22T14:53:12 |
| 1 | Abishek Adhikari | 63.71 | 4985 | 785 | 100.64 | 30 | Diamond | 4 | 0 | 0 | 2025-08-18T11:21:05 |
| 2 | Anjana Subba | 82.00 | 5311 | 846 | 108.46 | 33 | Diamond | 2 | 2 | 0 | 2025-09-10T13:22:29 |
| 3 | Arpan Rai | 82.81 | 5547 | 790 | 101.28 | 29 | Diamond | 4 | 0 | 0 | 2025-08-09T18:04:17 |
| 4 | Arpana Ghimirey | 78.34 | 4773 | 509 | 65.26 | 21 | Bronze | 1 | 0 | 0 | 2025-10-22T12:40:02 |
In [13]:
# --- 2. GENERATE/LOAD DATA (Using the Large Numbers dataset) ---
# We use the student performance data from Large Numbers.pdf
# This section replaces the 80,000 data point example with our actual dataset
import pandas as pd
import numpy as np
# Create DataFrame from the provided data
data = [
["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]
# Create the DataFrame
df = pd.DataFrame(data, columns=[
"Name", "Accuracy", "Time (total)", "Score (780)", "Score (%)",
"Exercises started", "Trophies", "Easy", "Moderate", "Hard", "Last submission date"
])
# 2. Define the features for Density Estimation (Investigation)
# We use 'Accuracy' and 'Score (%)' as the two variables (X and Y)
# Extract and clean 'Accuracy' (X) - remove % symbol and convert to float
x = df['Accuracy'].str.rstrip('%').astype(float).values
# Extract and clean 'Score (%)' (Y) - remove % symbol and convert to float
y = df['Score (%)'].str.rstrip('%').astype(float).values
# 3. Verification step:
# Ensure both arrays have the exact same length
if len(x) != len(y):
    min_len = min(len(x), len(y))
    x = x[:min_len]
    y = y[:min_len]
    print(f"Warning: Arrays were unequal length and were trimmed to N={len(x)}.")
npts = len(x) # Define N based on the length of the data
print(f"Data successfully loaded. N={npts} data points.")
print(f"X (Accuracy) range: {np.min(x):.2f}% to {np.max(x):.2f}%")
print(f"Y (Score %) range: {np.min(y):.2f}% to {np.max(y):.2f}%")
Data successfully loaded. N=27 data points.
X (Accuracy) range: 56.79% to 85.73%
Y (Score %) range: 52.82% to 108.46%
In [14]:
import numpy as np
# --- 2b. DATA PREPROCESSING (Standardization) ---
# Calculate mean and standard deviation for Accuracy and Score (%)
accuracy_mean = np.mean(x)
accuracy_std = np.std(x)
score_mean = np.mean(y)
score_std = np.std(y)
# Perform standardization: (Data - Mean) / Standard Deviation
accuracy_standardized = (x - accuracy_mean) / accuracy_std
score_standardized = (y - score_mean) / score_std
# Reassign standardized data to x and y for subsequent calculations
x = accuracy_standardized
y = score_standardized
print("Data standardized (mean 0, variance 1).")
print(f"Accuracy - Mean: {accuracy_mean:.2f}%, Std: {accuracy_std:.2f}%")
print(f"Score (%) - Mean: {score_mean:.2f}%, Std: {score_std:.2f}%")
print(f"Standardized Accuracy range: {np.min(x):.2f} to {np.max(x):.2f}")
print(f"Standardized Score (%) range: {np.min(y):.2f} to {np.max(y):.2f}")
Data standardized (mean 0, variance 1).
Accuracy - Mean: 74.98%, Std: 7.24%
Score (%) - Mean: 90.37%, Std: 18.76%
Standardized Accuracy range: -2.51 to 1.48
Standardized Score (%) range: -2.00 to 0.96
In [15]:
import numpy as np
from sklearn.cluster import KMeans
# --- 1. SETUP PARAMETERS ---
nclusters = 3 # Reasonable for 3 trophy types (Bronze, Gold, Diamond)
nsteps = 25 # Number of EM iterations
# --- 3. INITIALIZE MODEL PARAMETERS (Robust: K-Means++) ---
# Prepare stacked (x, y) data for clustering: Accuracy vs Score (%)
X_data = np.column_stack((x, y)) # x=Accuracy, y=Score (%) — assumed already standardized
# Robust initialization via K-Means++ (avoids poor random starts)
kmeans = KMeans(n_clusters=nclusters, n_init=10, random_state=42)
kmeans.fit(X_data)
# Initialize means using K-Means centroids
mux = kmeans.cluster_centers_[:, 0] # x-coordinates (Accuracy)
muy = kmeans.cluster_centers_[:, 1] # y-coordinates (Score %)
# Initialize log-likelihood tracking
log_likelihoods = []
# Initialize global variances (based on data spread)
varx = np.var(x) # Use sample variance instead of range-squared
vary = np.var(y)
# Initialize equal mixing proportions
pc = np.ones(nclusters) / nclusters # Uniform prior: p(c_m) = 1/K
# Optional: Initialize cluster-specific covariances (diagonal)
# For full GMM, you'd use Sigma_m (2×2), but here we start with isotropic approx:
sigmax = np.sqrt(varx) * np.ones(nclusters) # std per cluster (x)
sigmay = np.sqrt(vary) * np.ones(nclusters) # std per cluster (y)
print(f"✅ Initialized {nclusters} clusters with K-Means++")
print(f"Initial means (Accuracy, Score %):")
for i in range(nclusters):
    print(f" Cluster {i+1}: μx={mux[i]:.3f}, μy={muy[i]:.3f}")
✅ Initialized 3 clusters with K-Means++
Initial means (Accuracy, Score %):
 Cluster 1: μx=0.563, μy=-1.664
 Cluster 2: μx=-1.503, μy=0.126
 Cluster 3: μx=0.376, μy=0.615
In [16]:
import numpy as np
import pandas as pd
# Load data (from your table)
data_lines = [
["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
# ... (all 27 rows — abbreviated for clarity)
["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"]
]
df = pd.DataFrame(data_lines, columns=[
"Name", "Accuracy", "Time (total)", "Score (780)", "Score (%)",
"Exercises started", "Trophies", "Easy", "Moderate", "Hard", "Last submission date"
])
# Clean and convert
df["Accuracy"] = df["Accuracy"].str.rstrip('%').astype(float)
df["Score (%)"] = df["Score (%)"].str.rstrip('%').astype(float)
# Use real data — no simulation (note: only the two rows written out above are loaded in this abbreviated cell; the next cell uses the full table)
x = df["Accuracy"].values
y = df["Score (%)"].values
print(f"Loaded real data: N = {len(x)} points")
Loaded real data: N = 2 points
In [17]:
import numpy as np
import pandas as pd
# --- Load and prepare data from Large Numbers.pdf ---
data = [
["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]
df = pd.DataFrame(data, columns=[
"Name","Accuracy","Time (total)","Score (780)","Score (%)",
"Exercises started","Trophies","Easy","Moderate","Hard","Last submission date"
])
# Clean percentage columns
df["Accuracy"] = df["Accuracy"].str.rstrip('%').astype(float)
df["Score (%)"] = df["Score (%)"].str.rstrip('%').astype(float)
# --- Extract variables for GMM (Accuracy vs Score %) ---
x = df["Accuracy"].values
y = df["Score (%)"].values
npts = len(x)
# --- 1. SETUP PARAMETERS ---
nclusters = 3 # Align with Bronze/Gold/Diamond
nsteps = 25 # EM iterations
# --- 2. INITIALIZE PARAMETERS (K-Means++ for robustness) ---
from sklearn.cluster import KMeans
X = np.column_stack((x, y))
kmeans = KMeans(n_clusters=nclusters, n_init=10, random_state=42).fit(X)
mux = kmeans.cluster_centers_[:, 0]
muy = kmeans.cluster_centers_[:, 1]
varx = np.var(x) * np.ones(nclusters) # per-cluster variance (isotropic init)
vary = np.var(y) * np.ones(nclusters)
pc = np.ones(nclusters) / nclusters
log_likelihoods = []
print("✅ Initialized GMM with K-Means++")
# --- 3. E-M ITERATION ---
for i in range(nsteps):
    # Create broadcasting matrices (npts × nclusters)
    xm = np.outer(x, np.ones(nclusters))     # shape: (27, 3)
    ym = np.outer(y, np.ones(nclusters))
    muxm = np.outer(np.ones(npts), mux)      # (27, 3)
    muym = np.outer(np.ones(npts), muy)
    varxm = np.outer(np.ones(npts), varx)    # FIXED: per-cluster varx
    varym = np.outer(np.ones(npts), vary)    # FIXED: was 'varx' → now 'vary'
    pcm = np.outer(np.ones(npts), pc)
    # E-STEP: Compute p(v|c) for isotropic Gaussian (independent x, y)
    eps = 1e-6
    px_given_c = (1 / np.sqrt(2 * np.pi * (varxm + eps))) * \
                 np.exp(- (xm - muxm)**2 / (2 * (varxm + eps)))
    py_given_c = (1 / np.sqrt(2 * np.pi * (varym + eps))) * \
                 np.exp(- (ym - muym)**2 / (2 * (varym + eps)))
    pvgc = px_given_c * py_given_c           # p(v|c) = p(x|c)p(y|c)
    # p(v,c) and p(c|v)
    pvc = pvgc * pcm
    pcgv = pvc / np.outer(np.sum(pvc, axis=1), np.ones(nclusters))
    # Log-likelihood
    marginal = np.sum(pvc, axis=1)
    ll = np.sum(np.log(np.clip(marginal, 1e-100, None)))
    log_likelihoods.append(ll)
    # M-STEP: Update parameters
    denominator = np.sum(pcgv, axis=0) + 1e-6  # shape (nclusters,) — safer than npts*pc
    pc = np.sum(pcgv, axis=0) / npts
    mux = np.sum(xm * pcgv, axis=0) / denominator
    muy = np.sum(ym * pcgv, axis=0) / denominator
    varx = 0.1 + np.sum((xm - muxm)**2 * pcgv, axis=0) / denominator
    vary = 0.1 + np.sum((ym - muym)**2 * pcgv, axis=0) / denominator
    print(f"Step {i+1:2d}/{nsteps}: Log-Likelihood = {ll:8.2f} | p(c) = {pc}")
print("\n✅ E-M completed.")
✅ Initialized GMM with K-Means++
Step  1/25: Log-Likelihood =  -212.52 | p(c) = [0.48003949 0.25732708 0.26263343]
Step  2/25: Log-Likelihood =  -195.13 | p(c) = [0.51450568 0.26758203 0.21791229]
Step  3/25: Log-Likelihood =  -174.41 | p(c) = [0.53367637 0.25929634 0.20702729]
Step  4/25: Log-Likelihood =  -166.31 | p(c) = [0.52366618 0.25925926 0.21707456]
Step  5/25: Log-Likelihood =  -165.48 | p(c) = [0.5189195  0.25925926 0.22182124]
Step  6/25: Log-Likelihood =  -165.37 | p(c) = [0.51762196 0.25925926 0.22311878]
Step  7/25: Log-Likelihood =  -165.36 | p(c) = [0.51755295 0.25925926 0.22318779]
Step  8/25: Log-Likelihood =  -165.35 | p(c) = [0.51781692 0.25925926 0.22292382]
Step  9/25: Log-Likelihood =  -165.35 | p(c) = [0.51814388 0.25925926 0.22259686]
Step 10/25: Log-Likelihood =  -165.35 | p(c) = [0.51846338 0.25925926 0.22227736]
Step 11/25: Log-Likelihood =  -165.35 | p(c) = [0.51875924 0.25925926 0.2219815 ]
Step 12/25: Log-Likelihood =  -165.35 | p(c) = [0.51902914 0.25925926 0.2217116 ]
Step 13/25: Log-Likelihood =  -165.35 | p(c) = [0.51927422 0.25925926 0.22146652]
Step 14/25: Log-Likelihood =  -165.35 | p(c) = [0.51949632 0.25925926 0.22124442]
Step 15/25: Log-Likelihood =  -165.35 | p(c) = [0.5196974  0.25925926 0.22104334]
Step 16/25: Log-Likelihood =  -165.35 | p(c) = [0.51987929 0.25925926 0.22086145]
Step 17/25: Log-Likelihood =  -165.35 | p(c) = [0.52004371 0.25925926 0.22069703]
Step 18/25: Log-Likelihood =  -165.35 | p(c) = [0.52019225 0.25925926 0.22054849]
Step 19/25: Log-Likelihood =  -165.35 | p(c) = [0.52032638 0.25925926 0.22041437]
Step 20/25: Log-Likelihood =  -165.35 | p(c) = [0.52044742 0.25925926 0.22029332]
Step 21/25: Log-Likelihood =  -165.35 | p(c) = [0.52055661 0.25925926 0.22018413]
Step 22/25: Log-Likelihood =  -165.35 | p(c) = [0.52065506 0.25925926 0.22008568]
Step 23/25: Log-Likelihood =  -165.35 | p(c) = [0.52074381 0.25925926 0.21999693]
Step 24/25: Log-Likelihood =  -165.35 | p(c) = [0.52082378 0.25925926 0.21991697]
Step 25/25: Log-Likelihood =  -165.35 | p(c) = [0.52089581 0.25925926 0.21984493]

✅ E-M completed.
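As a sanity check on the hand-rolled E-M loop, the same points can be fitted with scikit-learn's GaussianMixture using a diagonal covariance, which matches the independence assumption p(v|c) = p(x|c)p(y|c) above. This is only a cross-check sketch; it assumes x, y and numpy from the previous cell and that scikit-learn is available:

from sklearn.mixture import GaussianMixture
X_check = np.column_stack((x, y))
gmm = GaussianMixture(n_components=3, covariance_type='diag', random_state=42).fit(X_check)
print("sklearn mixing proportions:", np.round(gmm.weights_, 3))
print("sklearn mean log-likelihood per point:", round(gmm.score(X_check), 2))

The mixing proportions and per-point log-likelihood should be broadly comparable to the manual loop, though not identical, because of the 0.1 variance floor and the isotropic initialisation used above.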
In [18]:
import matplotlib.pyplot as plt
# --- 4b. PLOT CONVERGENCE (GMM Log-Likelihood) ---
if log_likelihoods:  # assuming `log_likelihoods` is from your EM loop
    plt.figure(figsize=(8, 6))
    plt.plot(range(1, len(log_likelihoods) + 1), log_likelihoods,
             marker='o', linestyle='-', color='steelblue', markersize=5, linewidth=2)
    plt.title('Log-Likelihood Convergence of GMM (E-M Algorithm)', fontsize=14, fontweight='bold')
    plt.xlabel('EM Iteration', fontsize=12)
    plt.ylabel('Log-Likelihood $ \\mathcal{L}(\\theta) $', fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.7)
    # Highlight final value
    final_ll = log_likelihoods[-1]
    plt.axhline(y=final_ll, color='gray', linestyle=':', linewidth=1)
    plt.text(len(log_likelihoods), final_ll + 0.05 * abs(final_ll),
             f'Final: {final_ll:.1f}',
             fontsize=10, ha='right', va='bottom', color='gray')
    plt.tight_layout()
    plt.savefig("gmm_convergence.png", dpi=150, bbox_inches='tight')
    plt.show()
    print("✅ Convergence plot saved as 'gmm_convergence.png'")
else:
    print("⚠️ No log-likelihood history found. Run the EM algorithm first.")
✅ Convergence plot saved as 'gmm_convergence.png'
In [19]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd
# --- Load and clean data ---
data = [
["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]
df = pd.DataFrame(data, columns=[
"Name","Accuracy","Time (total)","Score (780)","Score (%)",
"Exercises started","Trophies","Easy","Moderate","Hard","Last submission date"
])
df["Accuracy"] = df["Accuracy"].str.rstrip('%').astype(float)
df["Score (%)"] = df["Score (%)"].str.rstrip('%').astype(float)
# Extract data
x = df["Accuracy"].values
y = df["Score (%)"].values
# --- Fit simple 3-component GMM (isotropic, equal weights for demo) ---
# Use cluster centers by trophy
centers = df.groupby("Trophies")[["Accuracy", "Score (%)"]].mean()
sigmas = df.groupby("Trophies")[["Accuracy", "Score (%)"]].std()
# Define grid
x_min, x_max = x.min() - 5, x.max() + 5
y_min, y_max = y.min() - 5, y.max() + 5
X_grid = np.linspace(x_min, x_max, 100)
Y_grid = np.linspace(y_min, y_max, 100)
X, Y = np.meshgrid(X_grid, Y_grid)
pos = np.dstack((X, Y))
# Compute GMM density: p(x,y) = Σ_k π_k * N(x|μₖ,σₖ²) * N(y|νₖ,τₖ²)
p = np.zeros_like(X)
for trophy in ["Bronze", "Gold", "Diamond"]:
if trophy in centers.index:
mu_x = centers.loc[trophy, "Accuracy"]
mu_y = centers.loc[trophy, "Score (%)"]
sigma_x = sigmas.loc[trophy, "Accuracy"] or 5.0
sigma_y = sigmas.loc[trophy, "Score (%)"] or 8.0
weight = len(df[df["Trophies"] == trophy]) / len(df)
# Independent Gaussians (isotropic per axis)
gauss_x = np.exp(-0.5 * ((X - mu_x) / sigma_x)**2) / (sigma_x * np.sqrt(2 * np.pi))
gauss_y = np.exp(-0.5 * ((Y - mu_y) / sigma_y)**2) / (sigma_y * np.sqrt(2 * np.pi))
p += weight * gauss_x * gauss_y
# --- Plot 3D surface ---
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')
# Surface
surf = ax.plot_surface(X, Y, p, cmap='viridis', alpha=0.9, linewidth=0, antialiased=True)
# Data points
ax.scatter(x, y, np.zeros_like(x), color='red', s=40, label='Students', alpha=0.8, edgecolor='k')
# Labels & styling
ax.set_xlabel('Accuracy (%)', fontsize=11)
ax.set_ylabel('Score (%)', fontsize=11)
ax.set_zlabel('Density', fontsize=11)
ax.set_title('Estimated Probability Distribution (3-Component GMM)', fontsize=14, pad=20)
ax.legend()
# Improve view angle
ax.view_init(elev=25, azim=-60)
# Color bar
fig.colorbar(surf, shrink=0.5, aspect=15, pad=0.1)
plt.tight_layout()
plt.savefig("gmm_3d_density.png", dpi=150, bbox_inches='tight')
plt.show()
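The same density surface is sometimes easier to read as a flat contour map. A short follow-up sketch, reusing the grid X, Y and the density p computed in the cell above:

plt.figure(figsize=(9, 7))
cs = plt.contourf(X, Y, p, levels=20, cmap='viridis')
plt.scatter(x, y, color='red', s=30, edgecolor='k', label='Students')
plt.colorbar(cs, label='Density')
plt.title("Contour View of the Estimated 3-Component GMM Density")
plt.xlabel("Accuracy (%)")
plt.ylabel("Score (%)")
plt.legend()
plt.show()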
In [20]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# --- Load data from Large Numbers.pdf ---
data = [
["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]
df = pd.DataFrame(data, columns=[
"Name","Accuracy","Time (total)","Score (780)","Score (%)",
"Exercises started","Trophies","Easy","Moderate","Hard","Last submission date"
])
# Clean percentages
df["Accuracy"] = df["Accuracy"].str.rstrip('%').astype(float)
df["Score (%)"] = df["Score (%)"].str.rstrip('%').astype(float)
# Define variables
x = df["Accuracy"].values
y = df["Score (%)"].values
npts = len(x)
# --- Simulate final EM output (pcgv, mux, muy) for demo ---
# In practice, these come from your EM loop (pcgv shape: npts × nclusters)
# Example: 3-cluster assignment — Bronze, Gold, Diamond
# Let’s assign clusters based on trophy for illustration
trophy_to_cluster = {"Bronze": 0, "Gold": 1, "Diamond": 2}
final_assignments = np.array([trophy_to_cluster[t] for t in df["Trophies"]])
# Cluster means (from real data)
centers = df.groupby("Trophies")[["Accuracy", "Score (%)"]].mean()
mux = centers["Accuracy"].values # [Bronze, Gold, Diamond]
muy = centers["Score (%)"].values
nclusters = len(centers)
# --- Plot cluster assignment ---
plt.figure(figsize=(10, 8))
# Scatter: real data, colored by cluster (0=Bronze, 1=Gold, 2=Diamond)
scatter = plt.scatter(x, y, c=final_assignments, cmap='Set1', s=80, alpha=0.8, edgecolor='k')
# Cluster centers
plt.scatter(mux, muy, c='black', marker='X', s=300, edgecolor='red', linewidth=2, label='Cluster Centers')
# Labels & styling
plt.title(f'Student Performance Clusters (K={nclusters})', fontsize=15)
plt.xlabel('Accuracy (%)', fontsize=13)
plt.ylabel('Score (%)', fontsize=13)
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
# Add colorbar with cluster labels
cbar = plt.colorbar(scatter, ticks=[0, 1, 2])
cbar.set_ticklabels(['Bronze', 'Gold', 'Diamond'])
cbar.set_label('Trophy-Based Cluster', rotation=270, labelpad=20)
plt.tight_layout()
plt.savefig("gmm_cluster_assignment.png", dpi=150, bbox_inches='tight')
plt.show()
In [21]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# --- Load and parse data from Large Numbers.pdf ---
data = [
["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]
df = pd.DataFrame(data, columns=[
"Name", "Accuracy", "Time (total)", "Score (780)", "Score (%)",
"Exercises started", "Trophies", "Easy", "Moderate", "Hard", "Last submission date"
])
# Clean percentage columns
df["Accuracy"] = df["Accuracy"].str.rstrip('%').astype(float)
df["Score (%)"] = df["Score (%)"].str.rstrip('%').astype(float)
# Extract variables
x = df["Accuracy"].values
y = df["Score (%)"].values
# --- Compute cluster centers & std devs by Trophy (3 clusters) ---
grouped = df.groupby("Trophies")[["Accuracy", "Score (%)"]]
# Fixed order: Bronze, Diamond, Gold (for consistent plotting)
order = ["Bronze", "Diamond", "Gold"]
mux = grouped.mean().loc[order, "Accuracy"].values
muy = grouped.mean().loc[order, "Score (%)"].values
stdx = grouped.std().loc[order, "Accuracy"].fillna(5.0).values  # fallback std if only 1 sample (e.g., Gold)
stdy = grouped.std().loc[order, "Score (%)"].fillna(8.0).values
# --- Plot ---
plt.figure(figsize=(9, 7))
plt.plot(x, y, 'o', color='steelblue', alpha=0.7, markersize=6, label='Students')
# Error bars: horizontal (±σ_x), vertical (±σ_y)
colors = ['#CD7F32', '#B9F2FF', '#FFD700'] # Bronze, Diamond, Gold
for i, trophy in enumerate(order):
    plt.errorbar(
        mux[i], muy[i],
        xerr=stdx[i], yerr=stdy[i],
        fmt='D',  # diamond marker
        color=colors[i],
        ecolor=colors[i],
        elinewidth=2,
        capsize=5,
        markersize=12,
        markeredgecolor='k',
        markeredgewidth=1.5,
        label=f'{trophy} Center ± Std'
    )
plt.title('Student Performance Clusters with Uncertainty (Trophy-Based)', fontsize=14)
plt.xlabel('Accuracy (%)', fontsize=12)
plt.ylabel('Score (%)', fontsize=12)
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.savefig("cluster_centers_with_errorbars.png", dpi=150, bbox_inches='tight')
plt.show()
In [ ]: