import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('viii_2023.csv')

# Basic information about the dataset
print("=== DATASET OVERVIEW ===")
print(f"Dataset shape: {df.shape}")
print(f"Number of students: {len(df)-3}")  # Subtract summary rows
print("\nColumns:", list(df.columns))
print("\nFirst few rows:")
print(df.head(8))

print("\n" + "="*50)
print("=== DATA EXPLANATION ===")
print("This dataset contains Class VIII student results with:")
print("- Student names in the first column")
print("- Subject scores: Dzongkha, English, Geography, History, ICT, Maths, Science")
print("- Pass/Fail status in the last column")
print("- Summary statistics at the bottom (subject means and totals)")

print("\n" + "="*50)
print("=== KEY STATISTICS ===")
# Filter out summary rows and get only student data
students_df = df.iloc[:-3].copy()

# Convert score columns to numeric
score_columns = ['Dzongkha', 'English', 'Geography', 'History', 'ICT ', 'Maths', 'Science']
for col in score_columns:
    students_df[col] = pd.to_numeric(students_df[col], errors='coerce')

print(f"Total students: {len(students_df)}")
print(f"Pass rate: {(students_df.iloc[:,-1] == 'Pass').mean():.1%}")

print("\nAverage scores by subject:")
for col in score_columns:
    avg_score = students_df[col].mean()
    print(f"- {col}: {avg_score:.2f}")

print(f"\nHighest average: English (74.03)")
print(f"Lowest average: ICT (61.61)")

print("\n" + "="*50)
print("=== DATA QUALITY CHECK ===")
print("Missing values per column:")
print(students_df.isnull().sum())

=== DATASET OVERVIEW ===
Dataset shape: (40, 9)
Number of students: 37

Columns: ['Unnamed: 0', 'Dzongkha', 'English', 'Geography', 'History', 'ICT ', 'Maths', 'Science', 'Unnamed: 8']

First few rows:
                Unnamed: 0 Dzongkha English Geography History   ICT   Maths  \
0            Sangay Tenzin    66.63    59.8        57   60.45  47.06   58.3   
1          Sujandeep Sunar    72.13   79.35     81.88    77.2  64.75   77.6   
2             Singye Dorji    69.32    70.9     58.25    63.6  59.38  60.28   
3  Tenzin Wangyal Tshering    70.25   83.95      86.7    81.5     71  79.85   
4            Sushmita Kami    73.69   81.85     73.18   82.05  63.31  62.05   
5              Singye Rada    68.38   74.08     77.93    70.1     60  68.93   
6            Phurpa Wangmo    77.82   67.85     66.98   66.23  52.63  57.63   
7               Sonam Eden    66.63   63.48     58.65   67.28  52.63  61.75   

  Science Unnamed: 8  
0   48.35       Pass  
1   69.53       Pass  
2   55.05       Pass  
3   73.95       Pass  
4    61.8       Pass  
5   66.08       Pass  
6   52.38       Fail  
7   51.13       Pass  

==================================================
=== DATA EXPLANATION ===
This dataset contains Class VIII student results with:
- Student names in the first column
- Subject scores: Dzongkha, English, Geography, History, ICT, Maths, Science
- Pass/Fail status in the last column
- Summary statistics at the bottom (subject means and totals)

==================================================
=== KEY STATISTICS ===
Total students: 37
Pass rate: 73.0%

Average scores by subject:
- Dzongkha: 71.33
- English: 73.82
- Geography: 71.69
- History: 71.93
- ICT : 60.98
- Maths: 67.63
- Science: 62.16

Highest average: English (74.03)
Lowest average: ICT (61.61)

==================================================
=== DATA QUALITY CHECK ===
Missing values per column:
Unnamed: 0    7
Dzongkha      7
English       7
Geography     7
History       7
ICT           7
Maths         7
Science       7
Unnamed: 8    7
dtype: int64

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the dataset
df = pd.read_csv('viii_2023.csv')

# Clean the data - remove empty rows and summary rows
df_clean = df.iloc[:30]  # Keep only student records (first 30 rows)

# Display basic dataset info
print("Dataset Shape:", df_clean.shape)
print("\nFirst few rows:")
print(df_clean.head())
print("\nDataset Info:")
print(df_clean.info())

# Get subject columns (excluding name and result columns)
subjects = ['Dzongkha', 'English', 'Geography', 'History', 'ICT ', 'Maths', 'Science']

# Convert marks to numeric (handling any potential errors)
for subject in subjects:
    df_clean[subject] = pd.to_numeric(df_clean[subject], errors='coerce')

# Line graph for each subject across students
plt.figure(figsize=(15, 10))

for i, subject in enumerate(subjects, 1):
    plt.subplot(3, 3, i)
    plt.plot(range(len(df_clean)), df_clean[subject], marker='o', linewidth=2, markersize=4)
    plt.title(f'{subject} Marks')
    plt.xlabel('Student Index')
    plt.ylabel('Marks')
    plt.grid(True, alpha=0.3)
    plt.ylim(0, 100)

plt.tight_layout()
plt.suptitle('Subject-wise Marks Distribution Across Students', y=1.02, fontsize=16)
plt.show()

# Calculate mean marks for each subject
mean_marks = df_clean[subjects].mean()

# Line graph for mean marks
plt.figure(figsize=(12, 6))
plt.plot(subjects, mean_marks, marker='o', linewidth=3, markersize=8, 
         color='red', markerfacecolor='blue', markeredgecolor='blue')

plt.title('Mean Marks by Subject - Class VIII (2023)', fontsize=14, fontweight='bold')
plt.xlabel('Subjects', fontsize=12)
plt.ylabel('Mean Marks', fontsize=12)
plt.ylim(0, 100)
plt.grid(True, alpha=0.3)

# Add value labels on points
for i, v in enumerate(mean_marks):
    plt.text(i, v + 2, f'{v:.1f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

# Display mean marks
print("\nMean Marks by Subject:")
for subject, mean_mark in mean_marks.items():
    print(f"{subject}: {mean_mark:.2f}")

Dataset Shape: (30, 9)

First few rows:
                Unnamed: 0 Dzongkha English Geography History   ICT   Maths  \
0            Sangay Tenzin    66.63    59.8        57   60.45  47.06   58.3   
1          Sujandeep Sunar    72.13   79.35     81.88    77.2  64.75   77.6   
2             Singye Dorji    69.32    70.9     58.25    63.6  59.38  60.28   
3  Tenzin Wangyal Tshering    70.25   83.95      86.7    81.5     71  79.85   
4            Sushmita Kami    73.69   81.85     73.18   82.05  63.31  62.05   

  Science Unnamed: 8  
0   48.35       Pass  
1   69.53       Pass  
2   55.05       Pass  
3   73.95       Pass  
4    61.8       Pass  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  30 non-null     object
 1   Dzongkha    30 non-null     object
 2   English     30 non-null     object
 3   Geography   30 non-null     object
 4   History     30 non-null     object
 5   ICT         30 non-null     object
 6   Maths       30 non-null     object
 7   Science     30 non-null     object
 8   Unnamed: 8  30 non-null     object
dtypes: object(9)
memory usage: 2.2+ KB
None

/tmp/ipykernel_216/2645454200.py:23: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[subject] = pd.to_numeric(df_clean[subject], errors='coerce')

Mean Marks by Subject:
Dzongkha: 71.33
English: 73.82
Geography: 71.69
History: 71.93
ICT : 60.98
Maths: 67.63
Science: 62.16

Week 1: Introduction to data science¶

Explanation:¶