[Kuenzang Dorji] - Fab Futures - Data Science

Week 4: Machine Learning (28 November 2025)¶

Lesson Overview¶

This section covers machine learning fundamentals, focusing on neural networks, which stack layers of neurons that apply activation functions (sigmoid, tanh, ReLU) to weighted inputs. Training uses backpropagation with gradient descent variants (SGD, ADAM, L-BFGS) together with techniques that prevent overfitting (dropout, regularization, early stopping). Key concepts include loss functions (MSE, cross-entropy), network architectures (deep vs. wide), and practical issues such as vanishing gradients and model optimization (pruning, quantization). It also references major frameworks (PyTorch, TensorFlow, JAX), model repositories (Hugging Face), and specialized applications such as large language models and edge-computing deployments.
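To make the training idea concrete, here is a minimal sketch, assuming only NumPy and a made-up toy dataset, of the loop the lesson describes: a single sigmoid neuron whose weight and bias are updated by gradient descent on an MSE loss.

import numpy as np

# Toy data (hypothetical values): one input feature, target in [0, 1]
X = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
y = np.array([0.0, 0.0, 0.5, 1.0, 1.0])

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

w, b = 0.0, 0.0   # weight and bias of a single neuron
lr = 0.5          # learning rate

for epoch in range(2000):
    p = sigmoid(w * X + b)        # forward pass: weighted input -> activation
    err = p - y                   # dMSE/dp (up to a constant factor)
    grad = err * p * (1 - p)      # chain rule back through the sigmoid
    w -= lr * np.mean(grad * X)   # gradient descent step for the weight
    b -= lr * np.mean(grad)       # ... and for the bias

print(f"w={w:.2f}, b={b:.2f}, MSE={np.mean((sigmoid(w*X + b) - y)**2):.4f}")

Backpropagation is exactly this chain-rule step repeated across layers; frameworks like PyTorch, TensorFlow, and JAX compute those gradients automatically.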

Assignment: We are asked to fit a machine learning model to our datasets¶

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# 1. Load dataset
df = pd.read_csv("datasets/viii_2023.csv")

# Clean column names
df.columns = df.columns.str.strip().str.replace('\n', '')

# Remove completely empty rows
df = df.dropna(how='all').reset_index(drop=True)

# 2. Convert numeric columns
df_numeric = df.copy()
for col in df_numeric.columns:
    if col.lower() != 'name':
        df_numeric[col] = pd.to_numeric(
            df_numeric[col].astype(str).str.replace(r'[^0-9.]', '', regex=True),
            errors='coerce'
        )

# Get numeric columns
numeric_cols = df_numeric.select_dtypes(include=[np.number]).columns.tolist()
print("Numeric subject columns detected:", numeric_cols)

if len(numeric_cols) == 0:
    raise ValueError("❌ No numeric columns could be converted! Check your CSV content.")

# 3. ML Polynomial fit & plot
plt.figure(figsize=(14, 8))
x = np.arange(len(df_numeric))

for subject in numeric_cols:
    y = df_numeric[subject].values

    # Skip subject if all values are NaN
    if np.all(np.isnan(y)):
        print(f"⚠ Skipping {subject}: all values are NaN")
        continue

    # Remove NaNs for fitting
    x_clean = x[~np.isnan(y)]
    y_clean = y[~np.isnan(y)]

    # Polynomial regression (degree 2)
    poly = PolynomialFeatures(degree=2)
    X_poly = poly.fit_transform(x_clean.reshape(-1, 1))
    model = LinearRegression().fit(X_poly, y_clean)
    y_pred = model.predict(X_poly)

    # Plot actual & fitted
    plt.plot(x_clean, y_clean, marker='o', label=f"{subject} — Actual")
    plt.plot(x_clean, y_pred, linestyle='--', label=f"{subject} — Model Fit")

# Graph settings
plt.title("Students' Marks with ML Model Fit")
plt.xlabel("Student Index")
plt.ylabel("Marks")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
Numeric subject columns detected: ['Unnamed: 0', 'Dzongkha', 'English', 'Geography', 'History', 'ICT', 'Maths', 'Science', 'Unnamed: 8']
⚠ Skipping Unnamed: 0: all values are NaN
⚠ Skipping Unnamed: 8: all values are NaN
[Figure: "Students' Marks with ML Model Fit": actual marks (dots) and degree-2 polynomial fits (dashed) per subject, plotted as Marks against Student Index]
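The two skipped columns are most likely CSV export artifacts (an unnamed index column and a trailing delimiter) rather than real subjects; if so, they could be dropped up front, e.g.:

# Drop auto-named, header-less columns (assumption: they hold no marks,
# as the all-NaN warnings above suggest)
df = df.loc[:, ~df.columns.str.startswith("Unnamed")]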

Explanation¶

This code fits a polynomial regression to the student marks data to model performance trends. It first cleans the dataset by stripping column names, dropping empty rows, and coercing each subject column to numeric, then fits a second-degree polynomial to each subject using scikit-learn's PolynomialFeatures and LinearRegression, skipping subjects and rows with missing values. The plot overlays the actual marks (dots) and the fitted trend lines (dashed) for every subject, so the polynomial curves can be compared against the raw marks across the student cohort.
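As a possible extension (not produced in the run above), each fit could be scored with scikit-learn's r2_score to check how well the degree-2 curve explains the marks. Inside the per-subject loop, after computing y_pred:

from sklearn.metrics import r2_score

r2 = r2_score(y_clean, y_pred)   # 1.0 = perfect fit, 0.0 = no better than the mean
print(f"{subject}: R² = {r2:.3f}")

A higher polynomial degree would hug the dots more closely but risks exactly the overfitting the lesson warns about; regularization or a train/test split would be the standard safeguards.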
