import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the data
df = pd.read_csv('datasets/viii_2023.csv', header=0)

# The first column contains names but has no header
# Let's properly name the columns
df.columns = ['Name', 'Dzongkha', 'English', 'Geography', 'History', 'ICT', 'Maths', 'Science', 'Result']

# Remove summary rows and empty rows
df = df[df['Name'].notna()]  # Remove rows where Name is NaN
df = df[~df['Name'].str.startswith(('S.M=', ','), na=False)]  # Remove summary rows

# Convert score columns to numeric
subject_cols = ['Dzongkha', 'English', 'Geography', 'History', 'ICT', 'Maths', 'Science']
for col in subject_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Create spectrogram visualization
plt.figure(figsize=(14, 8))

# Prepare data matrix (students × subjects)
score_matrix = df[subject_cols].values

# Create heatmap (spectrogram-like visualization)
plt.imshow(score_matrix.T, aspect='auto', cmap='YlOrRd', 
           interpolation='bilinear', origin='lower')

# Add colorbar and labels
cbar = plt.colorbar(label='Score (0-100)')
plt.title('Student Performance "Spectrogram" - Class VIII (2023)', fontsize=14, fontweight='bold')
plt.xlabel('Student Index')
plt.ylabel('Subject')

# Set y-ticks with subject names
plt.yticks(range(len(subject_cols)), subject_cols)

# Add horizontal lines for better readability
for i in range(len(subject_cols) + 1):
    plt.axhline(i - 0.5, color='white', linewidth=0.5, alpha=0.3)

plt.tight_layout()
plt.show()

# Performance Analysis
print("=== PERFORMANCE ANALYSIS ===")
print(f"Total Students: {len(df)}")
print(f"Pass Rate: {(df['Result'] == 'Pass').sum() / len(df) * 100:.1f}%")

# Calculate statistics
print("\n=== SUBJECT STATISTICS ===")
stats_df = df[subject_cols].agg(['mean', 'std', 'min', 'max']).T
stats_df.columns = ['Average', 'Std Dev', 'Minimum', 'Maximum']
print(stats_df.round(2))

# Top performers
print("\n=== TOP 5 STUDENTS ===")
df['Total'] = df[subject_cols].sum(axis=1)
top_students = df.nlargest(5, 'Total')[['Name', 'Total']]
for idx, (name, total) in enumerate(top_students.values, 1):
    avg = total / len(subject_cols)
    print(f"{idx}. {name}: {total:.1f} total, {avg:.2f} average")

# Subject correlation heatmap (additional analysis)
print("\n=== SUBJECT CORRELATION ===")
correlation_matrix = df[subject_cols].corr()
print(correlation_matrix.round(2))

=== PERFORMANCE ANALYSIS ===
Total Students: 30
Pass Rate: 90.0%

=== SUBJECT STATISTICS ===
           Average  Std Dev  Minimum  Maximum
Dzongkha     71.33     6.45    54.07    82.94
English      73.82     6.71    59.80    85.78
Geography    71.68     8.36    57.00    88.25
History      71.93     6.30    60.45    82.70
ICT          60.98     7.24    47.06    75.31
Maths        67.63     8.98    53.88    87.28
Science      62.16     9.51    48.35    86.50

=== TOP 5 STUDENTS ===
1. Sonam Tobgay Gyeltshen: 587.2 total, 83.89 average
2. Kinley Zam: 573.8 total, 81.97 average
3. Tenzin Wangyal Tshering: 547.2 total, 78.17 average
4. Sonam Wangmo: 527.9 total, 75.41 average
5. Sujandeep Sunar: 522.4 total, 74.63 average

=== SUBJECT CORRELATION ===
           Dzongkha  English  Geography  History   ICT  Maths  Science
Dzongkha       1.00     0.29       0.50     0.44  0.39   0.35     0.39
English        0.29     1.00       0.63     0.68  0.72   0.67     0.69
Geography      0.50     0.63       1.00     0.74  0.76   0.75     0.85
History        0.44     0.68       0.74     1.00  0.72   0.64     0.77
ICT            0.39     0.72       0.76     0.72  1.00   0.79     0.81
Maths          0.35     0.67       0.75     0.64  0.79   1.00     0.84
Science        0.39     0.69       0.85     0.77  0.81   0.84     1.00

week07: transform¶

Brief Explanation¶