import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the raw mark sheet from CSV.
df = pd.read_csv('datasets/viii_2023.csv')

# Clean the data - remove summary rows at the bottom and empty rows.
# Work on a copy so the raw frame `df` stays untouched.
df_clean = df.copy()

# Remove completely empty rows
df_clean = df_clean.dropna(how='all')

# Remove the summary rows at the bottom (which contain totals and averages).
# A row is dropped when the text of its first column matches any of:
#   S.M=  -> "S", any single character, then "M=" (the bare "." is a regex wildcard)
#   ,     -> a comma
#   \.\.  -> two consecutive literal dots
# The pattern is a raw string so "\." reaches the regex engine unchanged instead
# of raising Python's "invalid escape sequence" SyntaxWarning.
summary_row_pattern = r'S.M=|,|\.\.'
mask = ~df_clean.iloc[:, 0].astype(str).str.contains(summary_row_pattern, regex=True, na=False)
df_clean = df_clean[mask]

# Also remove any rows where the first (name) column is NaN.
# astype(str) above only affected the mask, so real NaN values are still present here.
df_clean = df_clean.dropna(subset=[df_clean.columns[0]])

# Reset index so rows are numbered 0..n-1 after the removals
df_clean = df_clean.reset_index(drop=True)

# Clean column names (strip surrounding whitespace; non-string labels are kept as-is)
df_clean.columns = [col.strip() if isinstance(col, str) else col for col in df_clean.columns]

# Rename the first column to 'Name'
df_clean = df_clean.rename(columns={df_clean.columns[0]: 'Name'})

# Convert the subject columns to numeric; values that cannot be parsed become NaN
subject_columns = ['Dzongkha', 'English', 'Maths']
for col in subject_columns:
    df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

# Average mark per subject over the cleaned rows (NaN entries are skipped by mean())
means = df_clean[subject_columns].mean()

divider = "=" * 40

print("Mean Marks Analysis:")
print(divider)
for subject in means.index:
    print(f"{subject}: {means[subject]:.2f}")

# Report which subject has the highest and which the lowest average
best_subject = means.idxmax()
worst_subject = means.idxmin()
print("\n" + divider)
print(f"Highest mean: {best_subject} ({means[best_subject]:.2f})")
print(f"Lowest mean: {worst_subject} ({means[worst_subject]:.2f})")

# Create visualization: a 2x2 dashboard comparing the subjects.
# NOTE: there is deliberately no separate plt.figure() call. plt.subplots()
# creates its own figure, so an extra plt.figure() only leaves behind an empty
# figure that notebooks render as "<Figure size 1200x800 with 0 Axes>".

# Set style before the axes are created so it applies to them
sns.set_style("whitegrid")
sns.set_palette("husl")

# One fixed colour per entry of subject_columns, shared by every panel so a
# subject is drawn in the same colour everywhere.
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']

# Create subplots
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Bar chart for mean marks
ax1 = axes[0, 0]
bars = ax1.bar(means.index, means.values, color=colors)
ax1.set_title('Mean Marks Comparison: Dzongkha, English, and Maths', fontsize=14, fontweight='bold')
ax1.set_ylabel('Mean Score', fontsize=12)
ax1.set_xlabel('Subjects', fontsize=12)
# 10% headroom above the tallest bar leaves room for the value labels
ax1.set_ylim(0, max(means.values) * 1.1)

# Add value labels just above each bar
for bar in bars:
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width() / 2., height + 0.5,
             f'{height:.2f}', ha='center', va='bottom', fontweight='bold')

# 2. Box plot for distribution
ax2 = axes[0, 1]
box_data = [df_clean[col].dropna() for col in subject_columns]
# patch_artist=True makes the boxes fillable patches (needed for set_facecolor).
# Tick labels are set separately instead of through boxplot()'s `labels`
# keyword, which Matplotlib 3.9 renamed to `tick_labels`; set_xticklabels
# does not depend on either keyword name.
bp = ax2.boxplot(box_data, patch_artist=True)
ax2.set_xticklabels(subject_columns)
ax2.set_title('Score Distribution by Subject', fontsize=14, fontweight='bold')
ax2.set_ylabel('Scores', fontsize=12)
ax2.set_xlabel('Subjects', fontsize=12)

# Color the boxes
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)

# 3. Individual student performance
ax3 = axes[1, 0]
sample_students = df_clean.head(10)  # Show first 10 students for clarity
x = np.arange(len(sample_students))
width = 0.25

# Grouped bars: one bar per subject, placed left of / on / right of each student's tick
for offset, subject_name, color in zip((-width, 0, width), subject_columns, colors):
    ax3.bar(x + offset, sample_students[subject_name], width, label=subject_name, color=color)

ax3.set_title('Individual Student Performance (First 10 Students)', fontsize=14, fontweight='bold')
ax3.set_ylabel('Scores', fontsize=12)
ax3.set_xlabel('Students', fontsize=12)
ax3.set_xticks(x)
ax3.set_xticklabels(sample_students['Name'], rotation=45, ha='right')
ax3.legend()
ax3.tick_params(axis='x', labelsize=9)

# 4. Pie chart for comparison
ax4 = axes[1, 1]
# Normalize means so each wedge is that subject's share of the summed means
normalized_means = means / means.sum()
wedges, texts, autotexts = ax4.pie(normalized_means, labels=subject_columns, autopct='%1.1f%%',
                                    colors=colors,
                                    startangle=90, explode=(0.05, 0.05, 0.05))
ax4.set_title('Proportion of Mean Scores', fontsize=14, fontweight='bold')

# Make the percentage labels inside the wedges more readable
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontweight('bold')

plt.suptitle('Class VIII 2023: Analysis of Dzongkha, English, and Maths Performance', 
             fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# Additional analysis
section_rule = "=" * 40
print("\n" + section_rule)
print("Additional Analysis:")
print(section_rule)

# Pairwise correlation between the subject score columns
correlation_matrix = df_clean[subject_columns].corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)

# For every subject, list the three rows with the largest score
print("\nTop 3 Performers in Each Subject:")
for subject in subject_columns:
    top_3 = df_clean.nlargest(3, subject)[['Name', subject]]
    print(f"\n{subject}:")
    for student, score in zip(top_3['Name'], top_3[subject]):
        print(f"  {student}: {score:.2f}")

# Write the cleaned frame to disk (without the index column) for reference
df_clean.to_csv('cleaned_student_data.csv', index=False)
print("\nCleaned data has been saved to 'cleaned_student_data.csv'")

<>:18: SyntaxWarning: invalid escape sequence '\.'
<>:18: SyntaxWarning: invalid escape sequence '\.'
/tmp/ipykernel_125/2371279958.py:18: SyntaxWarning: invalid escape sequence '\.'
  mask = ~df_clean.iloc[:, 0].astype(str).str.contains('S.M=|,|\.\.', na=False)

Mean Marks Analysis:
========================================
Dzongkha: 71.33
English: 73.82
Maths: 67.63

========================================
Highest mean: English (73.82)
Lowest mean: Maths (67.63)

/tmp/ipykernel_125/2371279958.py:77: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.
  bp = ax2.boxplot(box_data, labels=subject_columns, patch_artist=True)

<Figure size 1200x800 with 0 Axes>

========================================
Additional Analysis:
========================================

Correlation Matrix:
          Dzongkha   English     Maths
Dzongkha  1.000000  0.293538  0.349606
English   0.293538  1.000000  0.668332
Maths     0.349606  0.668332  1.000000

Top 3 Performers in Each Subject:

Dzongkha:
  Sonam Tobgay Gyeltshen: 82.94
  Khandu Lham: 81.19
  Tshering Pelden: 81.13

English:
  Sonam Tobgay Gyeltshen: 85.78
  Sonam Wangmo: 85.70
  Tenzin Wangyal Tshering: 83.95

Maths:
  Kinley Zam: 87.28
  Sonam Tobgay Gyeltshen: 87.20
  Tenzin Wangyal Tshering: 79.85

Cleaned data has been saved to 'cleaned_student_data.csv'

Week 2: Graphing using tools¶

Lesson¶

Assignment: We were asked to choose one dataset and create graphs from it.¶

Explanation¶