Fitting a Machine Learning Model into my function
In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
# 1. Load your dataset
df = pd.read_csv("datasets/enhanced_student_habits_performance_dataset.csv")
# OPTIONAL: Print columns so you know what to edit
print(df.columns)
# 2. Select features (Edit these according to your dataset)
feature_cols = ['study_hours_per_day', 'social_media_hours', 'netflix_hours']
# 3. Prepare features (X) and target (y)
X = df[feature_cols].fillna(0).values
y = df['exam_score'].values
# 4. Split into training and test sets
X_train, X_test, y_train, y_test, students_train, students_test = train_test_split(
    X,
    y,
    df['student_id'],  # keep the IDs for labelling the bar graph
    test_size=0.2,
    random_state=42
)
# 5. Build regression model (Linear Regression)
model = LinearRegression()
model.fit(X_train, y_train)
# 6. Predict and evaluate
y_pred = model.predict(X_test)
print("\n--- MODEL PERFORMANCE ---")
print("MSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)
# 7. Visualise actual vs predicted exam score as bar graph
plt.figure(figsize=(16, 6))
# Actual and predicted scores, drawn at the same x positions so the
# bars overlap; the next cell offsets them for easier comparison
plt.bar(students_test, y_test, width=0.4, alpha=0.6, label='Actual Score')
plt.bar(students_test, y_pred, width=0.4, alpha=0.7, label='Predicted Score')
plt.xlabel("Student")
plt.ylabel("Exam Score")
plt.title("Actual vs Predicted Exam Score Based on Student Habits")
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()
Index(['student_id', 'age', 'gender', 'major', 'study_hours_per_day',
'social_media_hours', 'netflix_hours', 'part_time_job',
'attendance_percentage', 'sleep_hours', 'diet_quality',
'exercise_frequency', 'parental_education_level', 'internet_quality',
'mental_health_rating', 'extracurricular_participation', 'previous_gpa',
'semester', 'stress_level', 'dropout_risk', 'social_activity',
'screen_time', 'study_environment', 'access_to_tutoring',
'family_income_range', 'parental_support_level', 'motivation_level',
'exam_anxiety_score', 'learning_style', 'time_management_score',
'exam_score'],
dtype='object')
--- MODEL PERFORMANCE ---
MSE: 127.15684797454337
R²: 0.06079950438357762
Coefficients: [ 1.3885396 -0.05227858 0.01108202]
Intercept: 83.46585233056746
/tmp/ipykernel_29914/4200213448.py:57: UserWarning: Creating legend with loc="best" can be slow with large amounts of data.
  plt.tight_layout()
/opt/conda/lib/python3.13/site-packages/IPython/core/pylabtools.py:170: UserWarning: Creating legend with loc="best" can be slow with large amounts of data.
  fig.canvas.print_figure(bytes_io, **kw)
In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
# Assuming y_test and y_pred already exist from your model
x = np.arange(len(y_test))
plt.figure(figsize=(16, 6))
plt.bar(x - 0.2, y_test, width=0.4, alpha=0.6, color='skyblue', label='Actual Score')
plt.bar(x + 0.2, y_pred, width=0.4, alpha=0.7, color='salmon', label='Predicted Score')
plt.xlabel("Test Students")
plt.ylabel("Exam Score")
plt.title("Actual vs Predicted Exam Score Based on Student Habits")
plt.xticks(x) # Numbers 0,1,2,... instead of IDs
plt.legend(loc='upper right') # fixed location to avoid warning
plt.tight_layout()
plt.show()
In [3]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeRegressor
# 1. Select 2 features for visualization
X_vis = df[['study_hours_per_day', 'sleep_hours']].fillna(0).values  # fill missing values with 0, matching the handling above
y_vis = df['exam_score'].values
# 2. Train Decision Tree Regressor
model = DecisionTreeRegressor(max_depth=3)
model.fit(X_vis, y_vis)
# 3. Create a grid for plotting
x_min, x_max = X_vis[:, 0].min() - 1, X_vis[:, 0].max() + 1
y_min, y_max = X_vis[:, 1].min() - 1, X_vis[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))
# 4. Predict exam score for each point on the grid
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# 5. Plot
plt.figure(figsize=(10,6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.coolwarm)
plt.scatter(X_vis[:,0], X_vis[:,1], c=y_vis, cmap=plt.cm.coolwarm, edgecolors='k', s=100)
plt.xlabel("Study Hours per Day")
plt.ylabel("Sleep Hours")
plt.title("Decision Tree Regression: Exam Score Prediction")
plt.colorbar(label="Exam Score")
plt.show()
In [5]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeRegressor
# 1. Randomly sample 1000 rows (or all if <1000)
sample_size = min(1000, len(df))
sample_df = df.sample(n=sample_size, random_state=42)
# 2. Select 2 features for visualization
X_vis = sample_df[['study_hours_per_day', 'sleep_hours']].fillna(0).values  # fill missing values with 0, matching the handling above
y_vis = sample_df['exam_score'].values
# 3. Train Decision Tree Regressor
model = DecisionTreeRegressor(max_depth=3)
model.fit(X_vis, y_vis)
# 4. Create a grid for plotting
x_min, x_max = X_vis[:, 0].min() - 1, X_vis[:, 0].max() + 1
y_min, y_max = X_vis[:, 1].min() - 1, X_vis[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))
# 5. Predict exam score for each point on the grid
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# 6. Plot
plt.figure(figsize=(10,6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.coolwarm)
plt.scatter(X_vis[:,0], X_vis[:,1], c=y_vis, cmap=plt.cm.coolwarm, edgecolors='k', s=50)
plt.xlabel("Study Hours per Day")
plt.ylabel("Sleep Hours")
plt.title("Decision Tree Regression: Exam Score Prediction (Sample of 1000)")
plt.colorbar(label="Exam Score")
plt.show()
In [6]:
import matplotlib.pyplot as plt
import numpy as np
# Sample 1000 points (or all if less than 1000)
sample_size = min(1000, len(y_test))
rng = np.random.default_rng(42)  # seed the sample so the plot is reproducible
indices = rng.choice(len(y_test), size=sample_size, replace=False)
y_test_sample = y_test[indices]
y_pred_sample = y_pred[indices]
plt.figure(figsize=(10,6))
plt.scatter(y_test_sample, y_pred_sample, alpha=0.6, color='dodgerblue', edgecolors='k')
plt.plot([y_test_sample.min(), y_test_sample.max()],
         [y_test_sample.min(), y_test_sample.max()],
         color='red', linewidth=2, linestyle='--')  # perfect-prediction line
plt.xlabel("Actual Exam Score")
plt.ylabel("Predicted Exam Score")
plt.title("Actual vs Predicted Exam Score (Sample of 1000 Students)")
plt.grid(True)
plt.show()
In [7]:
import matplotlib.pyplot as plt
import numpy as np
sample_size = min(50, len(y_test)) # smaller sample for clarity
rng = np.random.default_rng(42)  # seeded for a reproducible sample
indices = rng.choice(len(y_test), size=sample_size, replace=False)
y_test_sample = y_test[indices]
y_pred_sample = y_pred[indices]
x = np.arange(len(y_test_sample))
plt.figure(figsize=(12,6))
plt.bar(x - 0.2, y_test_sample, width=0.4, alpha=0.6, color='skyblue', label='Actual')
plt.bar(x + 0.2, y_pred_sample, width=0.4, alpha=0.7, color='salmon', label='Predicted')
plt.xlabel("Sampled Students")
plt.ylabel("Exam Score")
plt.title("Actual vs Predicted Exam Score (Sample of 50 Students)")
plt.xticks(x)
plt.legend()
plt.tight_layout()
plt.show()
In [8]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Random Forest MSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))
Random Forest MSE: 127.08749266507718
R²: 0.06131177361690521
In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# Features and target
# All numeric columns except the target; note this may include id-like or
# outcome-adjacent columns (e.g. previous_gpa), which can inflate R²
X = df.select_dtypes(include=['number']).drop(columns=['exam_score']).values
y = df['exam_score'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Linear Regression MSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))
Linear Regression MSE: 17.540846394277686
R²: 0.8704405473283257
In [ ]: