Fitting a Machine Learning Model into my function
In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
# 1. Load your dataset
df = pd.read_csv("datasets/enhanced_student_habits_performance_dataset.csv")
# OPTIONAL: Print columns so you know what to edit
print(df.columns)
# 2. Select features (Edit these according to your dataset)
feature_cols = ['study_hours_per_day', 'social_media_hours', 'netflix_hours']
# 3. Prepare features (X) and target (y)
X = df[feature_cols].fillna(0).values
y = df['exam_score'].values
# 4. Split into training and test sets
X_train, X_test, y_train, y_test, students_train, students_test = train_test_split(
    X,
    y,
    df['student_id'],  # keep the IDs for labelling the bar graph
    test_size=0.2,
    random_state=42
)
# 5. Build regression model (Linear Regression)
model = LinearRegression()
model.fit(X_train, y_train)
# 6. Predict and evaluate
y_pred = model.predict(X_test)
print("\n--- MODEL PERFORMANCE ---")
print("MSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)
# 7. Visualise actual vs predicted exam score as bar graph
plt.figure(figsize=(16, 6))
# Actual and predicted scores, drawn at the same x positions so the
# bars overlap; the next cell offsets them for easier comparison
plt.bar(students_test, y_test, width=0.4, alpha=0.6, label='Actual Score')
plt.bar(students_test, y_pred, width=0.4, alpha=0.7, label='Predicted Score')
plt.xlabel("Student")
plt.ylabel("Exam Score")
plt.title("Actual vs Predicted Exam Score Based on Student Habits")
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()
Index(['student_id', 'age', 'gender', 'major', 'study_hours_per_day',
'social_media_hours', 'netflix_hours', 'part_time_job',
'attendance_percentage', 'sleep_hours', 'diet_quality',
'exercise_frequency', 'parental_education_level', 'internet_quality',
'mental_health_rating', 'extracurricular_participation', 'previous_gpa',
'semester', 'stress_level', 'dropout_risk', 'social_activity',
'screen_time', 'study_environment', 'access_to_tutoring',
'family_income_range', 'parental_support_level', 'motivation_level',
'exam_anxiety_score', 'learning_style', 'time_management_score',
'exam_score'],
dtype='object')
--- MODEL PERFORMANCE ---
MSE: 127.15684797454337
R²: 0.06079950438357762
Coefficients: [ 1.3885396 -0.05227858 0.01108202]
Intercept: 83.46585233056746
/tmp/ipykernel_29914/4200213448.py:57: UserWarning: Creating legend with loc="best" can be slow with large amounts of data.
  plt.tight_layout()
/opt/conda/lib/python3.13/site-packages/IPython/core/pylabtools.py:170: UserWarning: Creating legend with loc="best" can be slow with large amounts of data.
  fig.canvas.print_figure(bytes_io, **kw)
In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
# Assuming y_test and y_pred already exist from your model
x = np.arange(len(y_test))
plt.figure(figsize=(16, 6))
plt.bar(x - 0.2, y_test, width=0.4, alpha=0.6, color='skyblue', label='Actual Score')
plt.bar(x + 0.2, y_pred, width=0.4, alpha=0.7, color='salmon', label='Predicted Score')
plt.xlabel("Test Students")
plt.ylabel("Exam Score")
plt.title("Actual vs Predicted Exam Score Based on Student Habits")
plt.xticks(x) # Numbers 0,1,2,... instead of IDs
plt.legend(loc='upper right') # fixed location to avoid warning
plt.tight_layout()
plt.show()
In [3]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeRegressor
# 1. Select 2 features for visualization
X_vis = df[['study_hours_per_day', 'sleep_hours']].fillna(0).values  # fill missing values with 0, matching the handling above
y_vis = df['exam_score'].values
# 2. Train Decision Tree Regressor
model = DecisionTreeRegressor(max_depth=3)
model.fit(X_vis, y_vis)
# 3. Create a grid for plotting
x_min, x_max = X_vis[:, 0].min() - 1, X_vis[:, 0].max() + 1
y_min, y_max = X_vis[:, 1].min() - 1, X_vis[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))
# 4. Predict exam score for each point on the grid
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# 5. Plot
plt.figure(figsize=(10,6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.coolwarm)
plt.scatter(X_vis[:,0], X_vis[:,1], c=y_vis, cmap=plt.cm.coolwarm, edgecolors='k', s=100)
plt.xlabel("Study Hours per Day")
plt.ylabel("Sleep Hours")
plt.title("Decision Tree Regression: Exam Score Prediction")
plt.colorbar(label="Exam Score")
plt.show()
In [5]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeRegressor
# 1. Randomly sample 1000 rows (or all if <1000)
sample_size = min(1000, len(df))
sample_df = df.sample(n=sample_size, random_state=42)
# 2. Select 2 features for visualization
X_vis = sample_df[['study_hours_per_day', 'sleep_hours']].fillna(0).values  # fill missing values with 0, matching the handling above
y_vis = sample_df['exam_score'].values
# 3. Train Decision Tree Regressor
model = DecisionTreeRegressor(max_depth=3)
model.fit(X_vis, y_vis)
# 4. Create a grid for plotting
x_min, x_max = X_vis[:, 0].min() - 1, X_vis[:, 0].max() + 1
y_min, y_max = X_vis[:, 1].min() - 1, X_vis[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))
# 5. Predict exam score for each point on the grid
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# 6. Plot
plt.figure(figsize=(10,6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.coolwarm)
plt.scatter(X_vis[:,0], X_vis[:,1], c=y_vis, cmap=plt.cm.coolwarm, edgecolors='k', s=50)
plt.xlabel("Study Hours per Day")
plt.ylabel("Sleep Hours")
plt.title("Decision Tree Regression: Exam Score Prediction (Sample of 1000)")
plt.colorbar(label="Exam Score")
plt.show()
In [6]:
import matplotlib.pyplot as plt
import numpy as np
# Sample 1000 points (or all if less than 1000)
sample_size = min(1000, len(y_test))
rng = np.random.default_rng(42)  # seed the sample so the plot is reproducible
indices = rng.choice(len(y_test), size=sample_size, replace=False)
y_test_sample = y_test[indices]
y_pred_sample = y_pred[indices]
plt.figure(figsize=(10,6))
plt.scatter(y_test_sample, y_pred_sample, alpha=0.6, color='dodgerblue', edgecolors='k')
plt.plot([y_test_sample.min(), y_test_sample.max()],
         [y_test_sample.min(), y_test_sample.max()],
         color='red', linewidth=2, linestyle='--')  # perfect-prediction line
plt.xlabel("Actual Exam Score")
plt.ylabel("Predicted Exam Score")
plt.title("Actual vs Predicted Exam Score (Sample of 1000 Students)")
plt.grid(True)
plt.show()
In [7]:
import matplotlib.pyplot as plt
import numpy as np
sample_size = min(50, len(y_test)) # smaller sample for clarity
rng = np.random.default_rng(42)  # seeded for a reproducible sample
indices = rng.choice(len(y_test), size=sample_size, replace=False)
y_test_sample = y_test[indices]
y_pred_sample = y_pred[indices]
x = np.arange(len(y_test_sample))
plt.figure(figsize=(12,6))
plt.bar(x - 0.2, y_test_sample, width=0.4, alpha=0.6, color='skyblue', label='Actual')
plt.bar(x + 0.2, y_pred_sample, width=0.4, alpha=0.7, color='salmon', label='Predicted')
plt.xlabel("Sampled Students")
plt.ylabel("Exam Score")
plt.title("Actual vs Predicted Exam Score (Sample of 50 Students)")
plt.xticks(x)
plt.legend()
plt.tight_layout()
plt.show()
In [8]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Random Forest MSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))
Random Forest MSE: 127.08749266507718
R²: 0.06131177361690521
In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# Features and target
# All numeric columns except the target; note this may include id-like or
# outcome-adjacent columns (e.g. previous_gpa), which can inflate R²
X = df.select_dtypes(include=['number']).drop(columns=['exam_score']).values
y = df['exam_score'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Linear Regression MSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))
Linear Regression MSE: 17.540846394277686
R²: 0.8704405473283257
In [ ]: