Dawa Tshering - Fab Futures - Data Science

Probability Distribution

Investigating my dataset’s probability distribution means examining the data closely to understand its shape and patterns. This helps me choose the most suitable mathematical model and decide whether basic measures like the mean and variability are sufficient or whether more advanced techniques are required.
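As a first, rough look at the shape (a sketch, not an executed cell; it assumes a DataFrame df whose Score (%) column has already been cleaned to a numeric type, as in the cells below), skewness, excess kurtosis, and a normality test summarise how far the scores are from a bell curve:

import numpy as np
from scipy import stats

scores = df["Score (%)"].dropna().to_numpy()

print("Skewness:", stats.skew(scores))             # < 0 suggests a left-heavy tail
print("Excess kurtosis:", stats.kurtosis(scores))  # > 0 suggests heavier tails than a normal

# D'Agostino-Pearson normality test (small-sample caveat: only ~27 points here)
stat, p = stats.normaltest(scores)
print(f"Normality test: statistic={stat:.2f}, p-value={p:.4f}")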

In [2]:
import pandas as pd

 
df = pd.read_excel("datasets/Large Numbers.xlsx")  # load dataset
print(df.head())  
               Name  Accuracy  Time (total)  Score (780)  Score (%)  \
0    Abhishek Subba  0.691000          6951          748   0.958974   
1  Abishek Adhikari  0.637108          4985          785   1.006410   
2      Anjana Subba  0.820000          5311          846   1.084615   
3         Arpan Rai  0.828077          5547          790   1.012821   
4   Arpana Ghimirey  0.783438          4773          509   0.652564   

   Exercises started Trophies  Easy  Moderate  Hard Last submission date  
0                 29     Gold     4         0     0  2025-10-22T14:53:12  
1                 30  Diamond     4         0     0  2025-08-18T11:21:05  
2                 33  Diamond     2         2     0  2025-09-10T13:22:29  
3                 29  Diamond     4         0     0  2025-08-09T18:04:17  
4                 21   Bronze     1         0     0  2025-10-22T12:40:02  
In [3]:
print(df.info())        
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Name                  28 non-null     object 
 1   Accuracy              28 non-null     float64
 2   Time (total)          28 non-null     int64  
 3   Score (780)           28 non-null     int64  
 4   Score (%)             28 non-null     float64
 5   Exercises started     28 non-null     int64  
 6   Trophies              28 non-null     object 
 7   Easy                  28 non-null     int64  
 8   Moderate              28 non-null     int64  
 9   Hard                  28 non-null     int64  
 10  Last submission date  27 non-null     object 
dtypes: float64(2), int64(6), object(3)
memory usage: 2.5+ KB
None
In [4]:
print(df.describe())  
        Accuracy  Time (total)  Score (780)  Score (%)  Exercises started  \
count  28.000000     28.000000    28.000000  28.000000          28.000000   
mean    0.723004   6791.785714   679.678571   0.871383          27.000000   
std     0.159138   3536.149186   197.897290   0.253714           6.738502   
min     0.000000      0.000000     0.000000   0.000000           0.000000   
25%     0.700578   4932.000000   499.750000   0.640705          22.750000   
50%     0.756880   5519.000000   784.000000   1.005128          30.000000   
75%     0.804888   8148.250000   791.000000   1.014103          31.000000   
max     0.857273  16432.000000   846.000000   1.084615          33.000000   

            Easy   Moderate       Hard  
count  28.000000  28.000000  28.000000  
mean    2.142857   0.892857   0.071429  
std     1.556689   1.286375   0.262265  
min     0.000000   0.000000   0.000000  
25%     1.000000   0.000000   0.000000  
50%     2.000000   0.000000   0.000000  
75%     4.000000   2.000000   0.000000  
max     4.000000   4.000000   1.000000  
In [5]:
import pandas as pd

# Parse the table from Large Numbers.pdf
data = [
    ["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
    ["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
    ["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
    ["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
    ["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
    ["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
    ["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
    ["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
    ["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
    ["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
    ["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
    ["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
    ["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
    ["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
    ["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
    ["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
    ["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
    ["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
    ["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
    ["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
    ["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
    ["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
    ["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
    ["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
    ["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
    ["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
    ["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]

df = pd.DataFrame(data, columns=[
    "Name", "Accuracy", "Time (total)", "Score (780)", "Score (%)",
    "Exercises started", "Trophies", "Easy", "Moderate", "Hard", "Last submission date"
])

# Clean 'Score (%)' (remove '%', convert to float)
df["Score (%)"] = df["Score (%)"].str.rstrip('%').astype(float)

# Compute mean and std
mean_score = df["Score (%)"].mean()
std_score = df["Score (%)"].std()

print("Mean score (%):", round(mean_score, 2))
print("Std score (%):", round(std_score, 2))
Mean score (%): 90.37
Std score (%): 19.12
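Since the dataset is literally called "Large Numbers", it is worth noting how the uncertainty of the mean depends on the sample size. A minimal sketch, assuming the df from the cell above with Score (%) already numeric:

import numpy as np

# Standard error of the mean: std / sqrt(n). It shrinks as n grows, which is the
# practical face of the law of large numbers for an average like this one.
n = len(df)
sem = df["Score (%)"].std() / np.sqrt(n)
print(f"n = {n}, standard error of the mean ≈ {sem:.2f} percentage points")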
In [6]:
import pandas as pd
import matplotlib.pyplot as plt

# Parse the markdown table from Large Numbers.pdf
data = [
    ["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
    ["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
    ["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
    ["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
    ["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
    ["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
    ["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
    ["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
    ["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
    ["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
    ["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
    ["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
    ["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
    ["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
    ["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
    ["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
    ["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
    ["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
    ["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
    ["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
    ["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
    ["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
    ["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
    ["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
    ["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
    ["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
    ["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]

# Create DataFrame
df = pd.DataFrame(data, columns=[
    "Name", "Accuracy", "Time (total)", "Score (780)", "Score (%)", 
    "Exercises started", "Trophies", "Easy", "Moderate", "Hard", "Last submission date"
])

# Clean the Score (%) column (remove % and convert to float)
df['Score (%)'] = df['Score (%)'].str.rstrip('%').astype(float)

# Create histogram
plt.figure(figsize=(10, 6))
plt.hist(df['Score (%)'], bins=15, density=True, alpha=0.6, color='skyblue', edgecolor='black')
plt.title("Histogram of Student Scores (%)", fontsize=14)
plt.xlabel("Score (%)", fontsize=12)
plt.ylabel("Density", fontsize=12)
plt.grid(axis='y', alpha=0.75, linestyle='--')
plt.show()
[Figure: Histogram of Student Scores (%)]
In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Parse the markdown table from Large Numbers.pdf
data = [
    ["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
    ["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
    ["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
    ["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
    ["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
    ["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
    ["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
    ["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
    ["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
    ["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
    ["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
    ["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
    ["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
    ["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
    ["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
    ["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
    ["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
    ["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
    ["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
    ["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
    ["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
    ["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
    ["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
    ["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
    ["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
    ["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
    ["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]

# Create DataFrame
df = pd.DataFrame(data, columns=[
    "Name", "Accuracy", "Time (total)", "Score (780)", "Score (%)", 
    "Exercises started", "Trophies", "Easy", "Moderate", "Hard", "Last submission date"
])

# Clean the Score (%) column (remove % and convert to float)
df['Score (%)'] = df['Score (%)'].str.rstrip('%').astype(float)

# Create KDE plot
plt.figure(figsize=(10, 6))
sns.kdeplot(df['Score (%)'], fill=True, color="red", alpha=0.5)
plt.title("Kernel Density Estimate of Student Scores (%)", fontsize=14)
plt.xlabel("Score (%)", fontsize=12)
plt.ylabel("Density", fontsize=12)
plt.grid(axis='y', alpha=0.3, linestyle='--')
plt.show()
[Figure: Kernel Density Estimate of Student Scores (%)]
In [9]:
import pandas as pd

# Parse the table from Large Numbers.pdf
data = [
    ["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
    ["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
    ["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
    ["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
    ["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
    ["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
    ["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
    ["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
    ["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
    ["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
    ["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
    ["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
    ["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
    ["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
    ["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
    ["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
    ["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
    ["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
    ["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
    ["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
    ["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
    ["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
    ["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
    ["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
    ["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
    ["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
    ["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]

# Create DataFrame
df = pd.DataFrame(data, columns=[
    "Name", "Accuracy", "Time (total)", "Score (780)", "Score (%)",
    "Exercises started", "Trophies", "Easy", "Moderate", "Hard", "Last submission date"
])

# Clean percentage columns (remove % and convert to float)
df["Accuracy"] = df["Accuracy"].str.rstrip('%').astype(float)
df["Score (%)"] = df["Score (%)"].str.rstrip('%').astype(float)

# Calculate covariance matrix between Accuracy and Score (%)
cov_matrix = df[['Accuracy', 'Score (%)']].cov()
print("Covariance matrix between Accuracy and Score (%):")
print(cov_matrix)
Covariance matrix between Accuracy and Score (%):
            Accuracy   Score (%)
Accuracy   54.493741  -14.938351
Score (%) -14.938351  365.589487
In [8]:
import pandas as pd

# Parse the table from Large Numbers.pdf
data = [
    ["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
    ["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
    ["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
    ["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
    ["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
    ["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
    ["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
    ["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
    ["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
    ["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
    ["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
    ["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
    ["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
    ["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
    ["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
    ["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
    ["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
    ["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
    ["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
    ["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
    ["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
    ["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
    ["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
    ["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
    ["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
    ["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
    ["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]

# Create DataFrame
df = pd.DataFrame(data, columns=[
    "Name", "Accuracy", "Time (total)", "Score (780)", "Score (%)",
    "Exercises started", "Trophies", "Easy", "Moderate", "Hard", "Last submission date"
])

# Clean percentage columns (remove % and convert to float)
df["Accuracy"] = df["Accuracy"].str.rstrip('%').astype(float)
df["Score (%)"] = df["Score (%)"].str.rstrip('%').astype(float)

# Calculate covariance between Accuracy and Score (%)
cov_value = df['Accuracy'].cov(df['Score (%)'])
print("Covariance between Accuracy and Score (%):", cov_value)
Covariance between Accuracy and Score (%): -14.938350854700879
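Covariance is scale-dependent, so the raw value is hard to interpret on its own. A short sketch, assuming cov_value and the cleaned df from the cell above are still in memory, normalises it into the Pearson correlation coefficient, which is bounded between -1 and 1:

# Pearson correlation = covariance / (std_x * std_y)
corr_manual = cov_value / (df["Accuracy"].std() * df["Score (%)"].std())
corr_pandas = df["Accuracy"].corr(df["Score (%)"])
print("Correlation (from covariance):", round(corr_manual, 3))
print("Correlation (pandas .corr()):", round(corr_pandas, 3))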
In [10]:
import pandas as pd
import matplotlib.pyplot as plt

# Parse data from Large Numbers.pdf
data = [
    ["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
    ["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
    ["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
    ["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
    ["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
    ["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
    ["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
    ["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
    ["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
    ["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
    ["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
    ["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
    ["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
    ["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
    ["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
    ["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
    ["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
    ["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
    ["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
    ["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
    ["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
    ["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
    ["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
    ["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
    ["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
    ["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
    ["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]

# Create DataFrame
df = pd.DataFrame(data, columns=[
    "Name", "Accuracy", "Time (total)", "Score (780)", "Score (%)",
    "Exercises started", "Trophies", "Easy", "Moderate", "Hard", "Last submission date"
])

# Clean numeric columns
df["Accuracy"] = df["Accuracy"].str.rstrip('%').astype(float)
df["Score (%)"] = df["Score (%)"].str.rstrip('%').astype(float)

# Scatter plot: Accuracy vs Score (%)
plt.figure(figsize=(8, 6))
plt.scatter(df['Accuracy'], df['Score (%)'], alpha=0.6, color='steelblue', s=60, edgecolor='k')
plt.title("Scatter Plot: Accuracy vs Score (%)", fontsize=14)
plt.xlabel("Accuracy (%)", fontsize=12)
plt.ylabel("Score (%)", fontsize=12)
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
[Figure: Scatter Plot of Accuracy vs Score (%)]
In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import Voronoi, voronoi_plot_2d
import time
from matplotlib.colors import ListedColormap

# Parse data from Large Numbers.pdf
data = [
    ["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
    ["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
    ["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
    ["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
    ["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
    ["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
    ["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
    ["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
    ["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
    ["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
    ["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
    ["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
    ["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
    ["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
    ["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
    ["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
    ["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
    ["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
    ["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
    ["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
    ["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
    ["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
    ["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
    ["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
    ["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
    ["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
    ["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]

# Create DataFrame
df = pd.DataFrame(data, columns=[
    "Name", "Accuracy", "Time (total)", "Score (780)", "Score (%)",
    "Exercises started", "Trophies", "Easy", "Moderate", "Hard", "Last submission date"
])

# Clean percentage columns
df["Accuracy"] = df["Accuracy"].str.rstrip('%').astype(float)
df["Score (%)"] = df["Score (%)"].str.rstrip('%').astype(float)

# Start timing
start_time = time.time()

# Prepare data for Voronoi diagram - using Accuracy and Score (%)
points = df[["Accuracy", "Score (%)"]].values

# Create Voronoi diagram
vor = Voronoi(points)

# End timing
end_time = time.time()
computation_time = end_time - start_time
print(f"Voronoi computation time: {computation_time:.6f} seconds")

# Create figure with appropriate size
fig = plt.figure(figsize=(12, 10))

# Plot Voronoi diagram
voronoi_plot_2d(vor, ax=plt.gca(), show_vertices=False, line_colors='darkblue', 
                line_width=1, point_size=15, line_alpha=0.8)

# Color regions by trophy type
trophy_colors = {'Bronze': '#CD7F32', 'Gold': '#FFD700', 'Diamond': '#B9F2FF'}
for i, region_index in enumerate(vor.point_region):
    region = vor.regions[region_index]
    if -1 not in region and len(region) > 0:  # skip unbounded (infinite) regions
        polygon = [vor.vertices[v] for v in region]
        trophy = df.iloc[i]["Trophies"]
        plt.fill(*zip(*polygon), color=trophy_colors[trophy], alpha=0.3)

# Plot the original points with colored markers
for trophy, color in trophy_colors.items():
    mask = df["Trophies"] == trophy
    plt.scatter(df[mask]["Accuracy"], df[mask]["Score (%)"], 
                c=color, s=100, label=trophy, edgecolors='k', linewidth=1.5)

# Add labels for some notable points
notable_students = ["Yeshey Tshoki", "Tenzin Sonam Dolkar", "Najimul Mia", "Kuenga Rinchen"]
for student in notable_students:
    idx = df[df["Name"] == student].index[0]
    plt.annotate(student, (df.iloc[idx]["Accuracy"], df.iloc[idx]["Score (%)"]),
                 xytext=(5, 5), textcoords='offset points', fontsize=9)

# Customize the plot
plt.title("Voronoi Diagram: Student Performance Regions\n(Accuracy vs Score %)", fontsize=16, fontweight='bold')
plt.xlabel("Accuracy (%)", fontsize=12)
plt.ylabel("Score (%)", fontsize=12)
plt.legend(title="Trophies", fontsize=10)
plt.grid(True, linestyle='--', alpha=0.7)

# Set axis limits with padding
x_min, x_max = df["Accuracy"].min() - 5, df["Accuracy"].max() + 5
y_min, y_max = df["Score (%)"].min() - 5, df["Score (%)"].max() + 5
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)

plt.tight_layout()
plt.savefig("student_performance_voronoi.png", dpi=300, bbox_inches='tight')
plt.show()

print("Voronoi diagram saved as 'student_performance_voronoi.png'")
Voronoi computation time: 0.001308 seconds
[Figure: Voronoi Diagram of Student Performance Regions (Accuracy vs Score %)]
Voronoi diagram saved as 'student_performance_voronoi.png'
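Each Voronoi cell is simply the set of locations closer to one student than to any other, so the diagram doubles as a nearest-neighbour map. A small sketch, assuming points and df from the cell above are still in memory; the query point is a hypothetical (Accuracy %, Score %) pair chosen for illustration:

from scipy.spatial import cKDTree

tree = cKDTree(points)               # same points used to build the Voronoi diagram
query_point = [75.0, 90.0]           # hypothetical (Accuracy %, Score %) pair
dist, idx = tree.query(query_point)  # nearest student = owner of the enclosing Voronoi cell
print(f"Nearest student to {query_point}: {df.iloc[idx]['Name']} (distance {dist:.2f})")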
In [12]:
df.head()
Out[12]:
Name Accuracy Time (total) Score (780) Score (%) Exercises started Trophies Easy Moderate Hard Last submission date
0 Abhishek Subba 69.10 6951 748 95.90 29 Gold 4 0 0 2025-10-22T14:53:12
1 Abishek Adhikari 63.71 4985 785 100.64 30 Diamond 4 0 0 2025-08-18T11:21:05
2 Anjana Subba 82.00 5311 846 108.46 33 Diamond 2 2 0 2025-09-10T13:22:29
3 Arpan Rai 82.81 5547 790 101.28 29 Diamond 4 0 0 2025-08-09T18:04:17
4 Arpana Ghimirey 78.34 4773 509 65.26 21 Bronze 1 0 0 2025-10-22T12:40:02
In [13]:
# --- 2. GENERATE/LOAD DATA (Using the Large Numbers dataset) ---
# We use the student performance data from Large Numbers.pdf
# This section replaces the 80,000 data point example with our actual dataset
import pandas as pd
import numpy as np

# Create DataFrame from the provided data
data = [
    ["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
    ["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
    ["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
    ["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
    ["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
    ["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
    ["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
    ["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
    ["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
    ["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
    ["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
    ["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
    ["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
    ["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
    ["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
    ["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
    ["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
    ["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
    ["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
    ["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
    ["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
    ["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
    ["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
    ["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
    ["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
    ["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
    ["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]

# Create the DataFrame
df = pd.DataFrame(data, columns=[
    "Name", "Accuracy", "Time (total)", "Score (780)", "Score (%)",
    "Exercises started", "Trophies", "Easy", "Moderate", "Hard", "Last submission date"
])

# 2. Define the features for Density Estimation (Investigation)
# We use 'Accuracy' and 'Score (%)' as the two variables (X and Y)

# Extract and clean 'Accuracy' (X) - remove % symbol and convert to float
x = df['Accuracy'].str.rstrip('%').astype(float).values

# Extract and clean 'Score (%)' (Y) - remove % symbol and convert to float
y = df['Score (%)'].str.rstrip('%').astype(float).values

# 3. Verification step:
# Ensure both arrays have the exact same length
if len(x) != len(y):
    min_len = min(len(x), len(y))
    x = x[:min_len]
    y = y[:min_len]
    print(f"Warning: Arrays were unequal length and were trimmed to N={len(x)}.")

npts = len(x) # Define N based on the length of the data

print(f"Data successfully loaded. N={npts} data points.")
print(f"X (Accuracy) range: {np.min(x):.2f}% to {np.max(x):.2f}%")
print(f"Y (Score %) range: {np.min(y):.2f}% to {np.max(y):.2f}%")
Data successfully loaded. N=27 data points.
X (Accuracy) range: 56.79% to 85.73%
Y (Score %) range: 52.82% to 108.46%
In [14]:
import numpy as np

# --- 2b. DATA PREPROCESSING (Standardization) ---

# Calculate mean and standard deviation for Accuracy and Score (%)
accuracy_mean = np.mean(x)
accuracy_std = np.std(x)
score_mean = np.mean(y)
score_std = np.std(y)

# Perform standardization: (Data - Mean) / Standard Deviation
accuracy_standardized = (x - accuracy_mean) / accuracy_std
score_standardized = (y - score_mean) / score_std

# Reassign standardized data to x and y for subsequent calculations
x = accuracy_standardized
y = score_standardized

print("Data standardized (mean 0, variance 1).")
print(f"Accuracy - Mean: {accuracy_mean:.2f}%, Std: {accuracy_std:.2f}%")
print(f"Score (%) - Mean: {score_mean:.2f}%, Std: {score_std:.2f}%")
print(f"Standardized Accuracy range: {np.min(x):.2f} to {np.max(x):.2f}")
print(f"Standardized Score (%) range: {np.min(y):.2f} to {np.max(y):.2f}")
Data standardized (mean 0, variance 1).
Accuracy - Mean: 74.98%, Std: 7.24%
Score (%) - Mean: 90.37%, Std: 18.76%
Standardized Accuracy range: -2.51 to 1.48
Standardized Score (%) range: -2.00 to 0.96
In [15]:
import numpy as np
from sklearn.cluster import KMeans

# --- 1. SETUP PARAMETERS ---
nclusters = 3        # Reasonable for 3 trophy types (Bronze, Gold, Diamond)
nsteps = 25          # Number of EM iterations

# --- 3. INITIALIZE MODEL PARAMETERS (Robust: K-Means++) ---

# Prepare stacked (x, y) data for clustering: Accuracy vs Score (%)
X_data = np.column_stack((x, y))  # x=Accuracy, y=Score (%) — assumed already standardized

# Robust initialization via K-Means++ (avoids poor random starts)
kmeans = KMeans(n_clusters=nclusters, n_init=10, random_state=42)
kmeans.fit(X_data)

# Initialize means using K-Means centroids
mux = kmeans.cluster_centers_[:, 0]  # x-coordinates (Accuracy)
muy = kmeans.cluster_centers_[:, 1]  # y-coordinates (Score %)

# Initialize log-likelihood tracking
log_likelihoods = []

# Initialize global variances (based on data spread)
varx = np.var(x)  # Use sample variance instead of range-squared
vary = np.var(y)

# Initialize equal mixing proportions
pc = np.ones(nclusters) / nclusters  # Uniform prior: p(c_m) = 1/K

# Optional: Initialize cluster-specific covariances (diagonal)
# For full GMM, you'd use Sigma_m (2×2), but here we start with isotropic approx:
sigmax = np.sqrt(varx) * np.ones(nclusters)  # std per cluster (x)
sigmay = np.sqrt(vary) * np.ones(nclusters)  # std per cluster (y)

print(f"✅ Initialized {nclusters} clusters with K-Means++")
print(f"Initial means (Accuracy, Score %):")
for i in range(nclusters):
    print(f"  Cluster {i+1}: μx={mux[i]:.3f}, μy={muy[i]:.3f}")
✅ Initialized 3 clusters with K-Means++
Initial means (Accuracy, Score %):
  Cluster 1: μx=0.563, μy=-1.664
  Cluster 2: μx=-1.503, μy=0.126
  Cluster 3: μx=0.376, μy=0.615
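Before running E-M it is worth checking whether K=3 is a sensible number of clusters at all. A quick sketch, assuming X_data, kmeans, and nclusters from the cell above are still in memory; silhouette values near 1 indicate well-separated clusters, values near 0 heavy overlap:

from sklearn.metrics import silhouette_score

# Silhouette score of the K-Means initialisation (computed on the standardized data)
sil = silhouette_score(X_data, kmeans.labels_)
print(f"Silhouette score for K={nclusters}: {sil:.3f}")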
In [16]:
import numpy as np
import pandas as pd

# Load data (from your table)
data_lines = [
    ["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
    # ... (middle rows intentionally omitted in this cell, so only the two rows
    # shown here are loaded; hence N = 2 below. The full table is rebuilt in the next cell.)
    ["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"]
]

df = pd.DataFrame(data_lines, columns=[
    "Name", "Accuracy", "Time (total)", "Score (780)", "Score (%)",
    "Exercises started", "Trophies", "Easy", "Moderate", "Hard", "Last submission date"
])

# Clean and convert
df["Accuracy"] = df["Accuracy"].str.rstrip('%').astype(float)
df["Score (%)"] = df["Score (%)"].str.rstrip('%').astype(float)

# Use real data — no simulation
x = df["Accuracy"].values
y = df["Score (%)"].values

print(f"Loaded real data: N = {len(x)} points")
Loaded real data: N = 2 points
In [17]:
import numpy as np
import pandas as pd

# --- Load and prepare data from Large Numbers.pdf ---
data = [
    ["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
    ["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
    ["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
    ["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
    ["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
    ["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
    ["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
    ["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
    ["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
    ["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
    ["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
    ["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
    ["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
    ["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
    ["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
    ["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
    ["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
    ["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
    ["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
    ["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
    ["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
    ["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
    ["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
    ["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
    ["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
    ["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
    ["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]

df = pd.DataFrame(data, columns=[
    "Name","Accuracy","Time (total)","Score (780)","Score (%)",
    "Exercises started","Trophies","Easy","Moderate","Hard","Last submission date"
])

# Clean percentage columns
df["Accuracy"] = df["Accuracy"].str.rstrip('%').astype(float)
df["Score (%)"] = df["Score (%)"].str.rstrip('%').astype(float)

# --- Extract variables for GMM (Accuracy vs Score %) ---
x = df["Accuracy"].values
y = df["Score (%)"].values
npts = len(x)

# --- 1. SETUP PARAMETERS ---
nclusters = 3      # Align with Bronze/Gold/Diamond
nsteps = 25        # EM iterations

# --- 2. INITIALIZE PARAMETERS (K-Means++ for robustness) ---
from sklearn.cluster import KMeans
X = np.column_stack((x, y))
kmeans = KMeans(n_clusters=nclusters, n_init=10, random_state=42).fit(X)
mux = kmeans.cluster_centers_[:, 0]
muy = kmeans.cluster_centers_[:, 1]
varx = np.var(x) * np.ones(nclusters)  # per-cluster variance (isotropic init)
vary = np.var(y) * np.ones(nclusters)
pc = np.ones(nclusters) / nclusters
log_likelihoods = []

print("✅ Initialized GMM with K-Means++")

# --- 3. E-M ITERATION ---
for i in range(nsteps):
    # Create broadcasting matrices (npts × nclusters)
    xm = np.outer(x, np.ones(nclusters))      # shape: (27, 3)
    ym = np.outer(y, np.ones(nclusters))
    muxm = np.outer(np.ones(npts), mux)      # (27, 3)
    muym = np.outer(np.ones(npts), muy)
    varxm = np.outer(np.ones(npts), varx)    # FIXED: per-cluster varx
    varym = np.outer(np.ones(npts), vary)    # FIXED: was 'varx' → now 'vary'
    pcm = np.outer(np.ones(npts), pc)

    # E-STEP: Compute p(v|c) for isotropic Gaussian (independent x, y)
    eps = 1e-6
    px_given_c = (1 / np.sqrt(2 * np.pi * (varxm + eps))) * \
                 np.exp(- (xm - muxm)**2 / (2 * (varxm + eps)))
    py_given_c = (1 / np.sqrt(2 * np.pi * (varym + eps))) * \
                 np.exp(- (ym - muym)**2 / (2 * (varym + eps)))
    pvgc = px_given_c * py_given_c  # p(v|c) = p(x|c)p(y|c)

    # p(v,c) and p(c|v)
    pvc = pvgc * pcm
    pcgv = pvc / np.outer(np.sum(pvc, axis=1), np.ones(nclusters))
    
    # Log-likelihood
    marginal = np.sum(pvc, axis=1)
    ll = np.sum(np.log(np.clip(marginal, 1e-100, None)))
    log_likelihoods.append(ll)

    # M-STEP: Update parameters
    denominator = np.sum(pcgv, axis=0) + 1e-6  # shape (nclusters,) — safer than npts*pc
    pc = np.sum(pcgv, axis=0) / npts
    mux = np.sum(xm * pcgv, axis=0) / denominator
    muy = np.sum(ym * pcgv, axis=0) / denominator
    varx = 0.1 + np.sum((xm - muxm)**2 * pcgv, axis=0) / denominator
    vary = 0.1 + np.sum((ym - muym)**2 * pcgv, axis=0) / denominator

    print(f"Step {i+1:2d}/{nsteps}: Log-Likelihood = {ll:8.2f}  |  p(c) = {pc}")

print("\n✅ E-M completed.")
✅ Initialized GMM with K-Means++
Step  1/25: Log-Likelihood =  -212.52  |  p(c) = [0.48003949 0.25732708 0.26263343]
Step  2/25: Log-Likelihood =  -195.13  |  p(c) = [0.51450568 0.26758203 0.21791229]
Step  3/25: Log-Likelihood =  -174.41  |  p(c) = [0.53367637 0.25929634 0.20702729]
Step  4/25: Log-Likelihood =  -166.31  |  p(c) = [0.52366618 0.25925926 0.21707456]
Step  5/25: Log-Likelihood =  -165.48  |  p(c) = [0.5189195  0.25925926 0.22182124]
Step  6/25: Log-Likelihood =  -165.37  |  p(c) = [0.51762196 0.25925926 0.22311878]
Step  7/25: Log-Likelihood =  -165.36  |  p(c) = [0.51755295 0.25925926 0.22318779]
Step  8/25: Log-Likelihood =  -165.35  |  p(c) = [0.51781692 0.25925926 0.22292382]
Step  9/25: Log-Likelihood =  -165.35  |  p(c) = [0.51814388 0.25925926 0.22259686]
Step 10/25: Log-Likelihood =  -165.35  |  p(c) = [0.51846338 0.25925926 0.22227736]
Step 11/25: Log-Likelihood =  -165.35  |  p(c) = [0.51875924 0.25925926 0.2219815 ]
Step 12/25: Log-Likelihood =  -165.35  |  p(c) = [0.51902914 0.25925926 0.2217116 ]
Step 13/25: Log-Likelihood =  -165.35  |  p(c) = [0.51927422 0.25925926 0.22146652]
Step 14/25: Log-Likelihood =  -165.35  |  p(c) = [0.51949632 0.25925926 0.22124442]
Step 15/25: Log-Likelihood =  -165.35  |  p(c) = [0.5196974  0.25925926 0.22104334]
Step 16/25: Log-Likelihood =  -165.35  |  p(c) = [0.51987929 0.25925926 0.22086145]
Step 17/25: Log-Likelihood =  -165.35  |  p(c) = [0.52004371 0.25925926 0.22069703]
Step 18/25: Log-Likelihood =  -165.35  |  p(c) = [0.52019225 0.25925926 0.22054849]
Step 19/25: Log-Likelihood =  -165.35  |  p(c) = [0.52032638 0.25925926 0.22041437]
Step 20/25: Log-Likelihood =  -165.35  |  p(c) = [0.52044742 0.25925926 0.22029332]
Step 21/25: Log-Likelihood =  -165.35  |  p(c) = [0.52055661 0.25925926 0.22018413]
Step 22/25: Log-Likelihood =  -165.35  |  p(c) = [0.52065506 0.25925926 0.22008568]
Step 23/25: Log-Likelihood =  -165.35  |  p(c) = [0.52074381 0.25925926 0.21999693]
Step 24/25: Log-Likelihood =  -165.35  |  p(c) = [0.52082378 0.25925926 0.21991697]
Step 25/25: Log-Likelihood =  -165.35  |  p(c) = [0.52089581 0.25925926 0.21984493]

✅ E-M completed.
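As a cross-check on the hand-written E-M loop (a sketch, not part of the original analysis; it assumes x and y from the cell above are still in memory), scikit-learn's GaussianMixture fits a full-covariance GMM to the same points and reports comparable mixing weights and a per-point log-likelihood:

import numpy as np
from sklearn.mixture import GaussianMixture

X = np.column_stack((x, y))
gmm = GaussianMixture(n_components=3, covariance_type='full', random_state=42).fit(X)
print("sklearn mixing weights:", np.round(gmm.weights_, 3))
print("sklearn mean log-likelihood per point:", round(gmm.score(X), 2))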
In [18]:
import matplotlib.pyplot as plt

# --- 4b. PLOT CONVERGENCE (GMM Log-Likelihood) ---
if log_likelihoods:  # assuming `log_likelihoods` is from your EM loop
    plt.figure(figsize=(8, 6))
    plt.plot(range(1, len(log_likelihoods) + 1), log_likelihoods, 
             marker='o', linestyle='-', color='steelblue', markersize=5, linewidth=2)
    plt.title('Log-Likelihood Convergence of GMM (E-M Algorithm)', fontsize=14, fontweight='bold')
    plt.xlabel('EM Iteration', fontsize=12)
    plt.ylabel('Log-Likelihood $ \\mathcal{L}(\\theta) $', fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.7)
    
    # Highlight final value
    final_ll = log_likelihoods[-1]
    plt.axhline(y=final_ll, color='gray', linestyle=':', linewidth=1)
    plt.text(len(log_likelihoods), final_ll + 0.05 * abs(final_ll), 
             f'Final: {final_ll:.1f}', 
             fontsize=10, ha='right', va='bottom', color='gray')
    
    plt.tight_layout()
    plt.savefig("gmm_convergence.png", dpi=150, bbox_inches='tight')
    plt.show()
    
    print(f"✅ Convergence plot saved as 'gmm_convergence.png'")
else:
    print("⚠️ No log-likelihood history found. Run the EM algorithm first.")
[Figure: Log-Likelihood Convergence of GMM (E-M Algorithm)]
✅ Convergence plot saved as 'gmm_convergence.png'
In [19]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd

# --- Load and clean data ---
data = [
    ["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
    ["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
    ["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
    ["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
    ["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
    ["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
    ["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
    ["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
    ["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
    ["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
    ["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
    ["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
    ["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
    ["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
    ["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
    ["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
    ["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
    ["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
    ["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
    ["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
    ["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
    ["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
    ["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
    ["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
    ["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
    ["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
    ["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]

df = pd.DataFrame(data, columns=[
    "Name","Accuracy","Time (total)","Score (780)","Score (%)",
    "Exercises started","Trophies","Easy","Moderate","Hard","Last submission date"
])
df["Accuracy"] = df["Accuracy"].str.rstrip('%').astype(float)
df["Score (%)"] = df["Score (%)"].str.rstrip('%').astype(float)

# Extract data
x = df["Accuracy"].values
y = df["Score (%)"].values

# --- Fit simple 3-component GMM (isotropic, equal weights for demo) ---
# Use cluster centers by trophy
centers = df.groupby("Trophies")[["Accuracy", "Score (%)"]].mean()
sigmas = df.groupby("Trophies")[["Accuracy", "Score (%)"]].std()

# Define grid
x_min, x_max = x.min() - 5, x.max() + 5
y_min, y_max = y.min() - 5, y.max() + 5
X_grid = np.linspace(x_min, x_max, 100)
Y_grid = np.linspace(y_min, y_max, 100)
X, Y = np.meshgrid(X_grid, Y_grid)
pos = np.dstack((X, Y))

# Compute GMM density: p(x,y) = Σ_k π_k * N(x|μₖ,σₖ²) * N(y|νₖ,τₖ²)
p = np.zeros_like(X)

for trophy in ["Bronze", "Gold", "Diamond"]:
    if trophy in centers.index:
        mu_x = centers.loc[trophy, "Accuracy"]
        mu_y = centers.loc[trophy, "Score (%)"]
        sigma_x = sigmas.loc[trophy, "Accuracy"]
        sigma_y = sigmas.loc[trophy, "Score (%)"]
        # Fallback for single-member groups (e.g. Gold), whose std is NaN
        # (a plain `or 5.0` would not trigger here because NaN is truthy)
        if pd.isna(sigma_x):
            sigma_x = 5.0
        if pd.isna(sigma_y):
            sigma_y = 8.0
        weight = len(df[df["Trophies"] == trophy]) / len(df)
        
        # Independent Gaussians (isotropic per axis)
        gauss_x = np.exp(-0.5 * ((X - mu_x) / sigma_x)**2) / (sigma_x * np.sqrt(2 * np.pi))
        gauss_y = np.exp(-0.5 * ((Y - mu_y) / sigma_y)**2) / (sigma_y * np.sqrt(2 * np.pi))
        p += weight * gauss_x * gauss_y

# --- Plot 3D surface ---
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

# Surface
surf = ax.plot_surface(X, Y, p, cmap='viridis', alpha=0.9, linewidth=0, antialiased=True)

# Data points
ax.scatter(x, y, np.zeros_like(x), color='red', s=40, label='Students', alpha=0.8, edgecolor='k')

# Labels & styling
ax.set_xlabel('Accuracy (%)', fontsize=11)
ax.set_ylabel('Score (%)', fontsize=11)
ax.set_zlabel('Density', fontsize=11)
ax.set_title('Estimated Probability Distribution (3-Component GMM)', fontsize=14, pad=20)
ax.legend()

# Improve view angle
ax.view_init(elev=25, azim=-60)

# Color bar
fig.colorbar(surf, shrink=0.5, aspect=15, pad=0.1)

plt.tight_layout()
plt.savefig("gmm_3d_density.png", dpi=150, bbox_inches='tight')
plt.show()
[Figure: Estimated Probability Distribution, 3-Component GMM (3D density surface)]
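The same mixture density can be easier to read top-down than as a 3D surface. A sketch, assuming X, Y, p, x, and y from the cell above are still in memory:

import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
cs = plt.contourf(X, Y, p, levels=20, cmap='viridis')  # filled contours of the GMM density
plt.scatter(x, y, c='red', s=30, edgecolor='k', label='Students')
plt.colorbar(cs, label='Density')
plt.xlabel('Accuracy (%)')
plt.ylabel('Score (%)')
plt.title('GMM Density (Contour View)')
plt.legend()
plt.tight_layout()
plt.show()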
In [20]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# --- Load data from Large Numbers.pdf ---
data = [
    ["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
    ["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
    ["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
    ["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
    ["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
    ["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
    ["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
    ["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
    ["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
    ["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
    ["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
    ["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
    ["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
    ["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
    ["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
    ["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
    ["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
    ["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
    ["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
    ["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
    ["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
    ["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
    ["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
    ["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
    ["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
    ["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
    ["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]

df = pd.DataFrame(data, columns=[
    "Name","Accuracy","Time (total)","Score (780)","Score (%)",
    "Exercises started","Trophies","Easy","Moderate","Hard","Last submission date"
])

# Clean percentages
df["Accuracy"] = df["Accuracy"].str.rstrip('%').astype(float)
df["Score (%)"] = df["Score (%)"].str.rstrip('%').astype(float)

# Define variables
x = df["Accuracy"].values
y = df["Score (%)"].values
npts = len(x)

# --- Simulate final EM output (pcgv, mux, muy) for demo ---
# In practice, these come from your EM loop (pcgv shape: npts × nclusters)

# Example: 3-cluster assignment — Bronze, Gold, Diamond
# Let’s assign clusters based on trophy for illustration
trophy_to_cluster = {"Bronze": 0, "Gold": 1, "Diamond": 2}
final_assignments = np.array([trophy_to_cluster[t] for t in df["Trophies"]])

# Cluster means (from real data)
centers = df.groupby("Trophies")[["Accuracy", "Score (%)"]].mean()
mux = centers["Accuracy"].values  # [Bronze, Gold, Diamond]
muy = centers["Score (%)"].values
nclusters = len(centers)

# --- Plot cluster assignment ---
plt.figure(figsize=(10, 8))

# Scatter: real data, colored by cluster (0=Bronze, 1=Gold, 2=Diamond)
scatter = plt.scatter(x, y, c=final_assignments, cmap='Set1', s=80, alpha=0.8, edgecolor='k')

# Cluster centers
plt.scatter(mux, muy, c='black', marker='X', s=300, edgecolor='red', linewidth=2, label='Cluster Centers')

# Labels & styling
plt.title(f'Student Performance Clusters (K={nclusters})', fontsize=15)
plt.xlabel('Accuracy (%)', fontsize=13)
plt.ylabel('Score (%)', fontsize=13)
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)

# Add colorbar with cluster labels
cbar = plt.colorbar(scatter, ticks=[0, 1, 2])
cbar.set_ticklabels(['Bronze', 'Gold', 'Diamond'])
cbar.set_label('Trophy-Based Cluster', rotation=270, labelpad=20)

plt.tight_layout()
plt.savefig("gmm_cluster_assignment.png", dpi=150, bbox_inches='tight')
plt.show()
[Figure: Student Performance Clusters (K=3)]
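The colouring above uses the trophy labels directly. To see how well they line up with the mixture model itself, a small sketch (assuming pcgv from the E-M cell In [17] is still in memory and that its rows follow the same student order as df) cross-tabulates the trophies against the hard GMM assignments:

import numpy as np
import pandas as pd

gmm_hard = np.argmax(pcgv, axis=1)  # hard assignment = cluster with highest posterior p(c|v)
print(pd.crosstab(df["Trophies"], gmm_hard, rownames=["Trophy"], colnames=["GMM cluster"]))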
In [21]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# --- Load and parse data from Large Numbers.pdf ---
data = [
    ["Abhishek Subba", "69.10%", 6951, 748, "95.90%", 29, "Gold", 4, 0, 0, "2025-10-22T14:53:12"],
    ["Abishek Adhikari", "63.71%", 4985, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-08-18T11:21:05"],
    ["Anjana Subba", "82.00%", 5311, 846, "108.46%", 33, "Diamond", 2, 2, 0, "2025-09-10T13:22:29"],
    ["Arpan Rai", "82.81%", 5547, 790, "101.28%", 29, "Diamond", 4, 0, 0, "2025-08-09T18:04:17"],
    ["Arpana Ghimirey", "78.34%", 4773, 509, "65.26%", 21, "Bronze", 1, 0, 0, "2025-10-22T12:40:02"],
    ["Chimi Dolma Gurung", "70.38%", 4093, 468, "60.00%", 23, "Bronze", 1, 0, 0, "2025-10-01T12:20:50"],
    ["Dawa Kelwang Keltshok", "73.07%", 4601, 782, "100.26%", 31, "Diamond", 4, 0, 0, "2025-05-20T13:15:02"],
    ["Jamyang Gurung", "77.57%", 5469, 781, "100.13%", 30, "Diamond", 4, 0, 0, "2025-05-15T20:20:30"],
    ["Jamyang Tenzin Namgyel", "75.23%", 5180, 797, "102.18%", 30, "Diamond", 2, 3, 0, "2025-09-03T14:34:27"],
    ["Jigme Tenzin Wangpo", "75.83%", 5037, 782, "100.26%", 30, "Diamond", 2, 0, 0, "2025-10-22T08:31:26"],
    ["Karma Dema Chokey", "75.55%", 16432, 788, "101.03%", 30, "Diamond", 4, 0, 0, "2025-09-25T13:18:29"],
    ["Kishan Rai", "79.70%", 4460, 800, "102.56%", 31, "Diamond", 0, 3, 0, "2025-09-29T12:12:10"],
    ["Kuenga Rinchen", "62.28%", 9502, 451, "57.82%", 22, "Bronze", 1, 0, 0, "2025-08-08T17:23:50"],
    ["Leki Tshomo", "65.09%", 15455, 782, "100.26%", 30, "Diamond", 1, 3, 0, "2025-11-03T20:48:32"],
    ["Lhakey Choden", "83.23%", 2665, 459, "58.85%", 20, "Bronze", 0, 2, 0, "2025-09-09T13:46:40"],
    ["Melan Rai", "71.27%", 7520, 448, "57.44%", 21, "Bronze", 1, 1, 0, "2025-08-28T13:22:57"],
    ["Mercy Jeshron Subba", "67.59%", 7630, 786, "100.77%", 31, "Diamond", 3, 0, 0, "2025-10-15T15:00:19"],
    ["Najimul Mia", "56.79%", 10148, 788, "101.03%", 30, "Diamond", 3, 1, 1, "2025-08-29T19:06:48"],
    ["Nima Kelwang Keltshok", "77.49%", 5491, 785, "100.64%", 30, "Diamond", 4, 0, 0, "2025-05-13T17:56:59"],
    ["Radha Dulal", "72.74%", 7431, 800, "102.56%", 31, "Diamond", 3, 1, 0, "2025-09-10T17:06:07"],
    ["Rigyel Singer", "76.60%", 10525, 787, "100.90%", 30, "Diamond", 0, 4, 1, "2025-10-08T13:28:29"],
    ["Susil Acharja", "73.73%", 5372, 794, "101.79%", 31, "Diamond", 4, 0, 0, "2025-06-08T19:19:10"],
    ["Tashi Tshokey Wangmo", "81.97%", 9897, 800, "102.56%", 30, "Diamond", 4, 0, 0, "2025-08-20T12:29:57"],
    ["Tashi Wangchuk", "85.39%", 5708, 472, "60.51%", 22, "Bronze", 0, 3, 0, "2025-09-08T12:30:39"],
    ["Tenzin Sonam Dolkar", "80.88%", 9247, 808, "103.59%", 31, "Diamond", 1, 2, 0, "2025-09-29T13:38:06"],
    ["Yeshey Tshoki", "85.73%", 2958, 412, "52.82%", 19, "Bronze", 1, 0, 0, "2025-08-06T14:36:48"],
    ["Yogira Kami", "80.36%", 7782, 783, "100.38%", 31, "Diamond", 2, 0, 0, "2025-10-08T13:25:35"],
]

df = pd.DataFrame(data, columns=[
    "Name", "Accuracy", "Time (total)", "Score (780)", "Score (%)",
    "Exercises started", "Trophies", "Easy", "Moderate", "Hard", "Last submission date"
])

# Clean percentage columns
df["Accuracy"] = df["Accuracy"].str.rstrip('%').astype(float)
df["Score (%)"] = df["Score (%)"].str.rstrip('%').astype(float)

# Extract variables
x = df["Accuracy"].values
y = df["Score (%)"].values

# --- Compute cluster centers & std devs by Trophy (3 clusters) ---
grouped = df.groupby("Trophies")[["Accuracy", "Score (%)"]]

# Fixed order: Bronze, Diamond, Gold (for consistent plotting)
order = ["Bronze", "Diamond", "Gold"]
mux = grouped.mean().loc[order, "Accuracy"].values
muy = grouped.mean().loc[order, "Score (%)"].values
stdx = grouped.std().loc[order, "Accuracy"].fillna(5.0).values   # fallback std if only 1 sample (e.g., Gold)
stdy = grouped.std().loc[order, "Score (%)"].fillna(8.0).values

# --- Plot ---
plt.figure(figsize=(9, 7))
plt.plot(x, y, 'o', color='steelblue', alpha=0.7, markersize=6, label='Students')

# Error bars: horizontal (±σ_x), vertical (±σ_y)
colors = ['#CD7F32', '#B9F2FF', '#FFD700']  # Bronze, Diamond, Gold
for i, trophy in enumerate(order):
    plt.errorbar(
        mux[i], muy[i],
        xerr=stdx[i], yerr=stdy[i],
        fmt='D',  # diamond marker
        color=colors[i],
        ecolor=colors[i],
        elinewidth=2,
        capsize=5,
        markersize=12,
        markeredgecolor='k',
        markeredgewidth=1.5,
        label=f'{trophy} Center ± Std'
    )

plt.title('Student Performance Clusters with Uncertainty (Trophy-Based)', fontsize=14)
plt.xlabel('Accuracy (%)', fontsize=12)
plt.ylabel('Score (%)', fontsize=12)
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.savefig("cluster_centers_with_errorbars.png", dpi=150, bbox_inches='tight')
plt.show()
[Figure: Student Performance Clusters with Uncertainty (Trophy-Based)]