Week 2 Assignment¶
For this week's assignment we were asked to fit models to our data.
In simple terms, "fitting" in data science is the process of using data points to create a mathematical model or function that best represents the relationship between variables.
What is a "Fit" (The Goal)?
The primary goal of the Data Science: Fitting module is to adjust a function to match data. When you "fit" data, you are developing a model (which is a function) that describes your observations.
Why Do We Do Fitting (The Goal of Data Science)?
Fitting helps achieve the overall goal of the introduction module, which is gaining insight from data. Specifically, fitting allows for several key uses, illustrated in the short sketch after this list:
- Regression: Fitting a continuous function to predict an outcome (like GPA based on study hours).
- Interpolation: Filling in unknown points that lie within the observed data set.
- Extrapolation: Projecting or forecasting points that lie outside of the observed data set.
- Modeling: Once a model is trained (developed), it can be used for inference (using the model to make predictions).
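As a concrete illustration of regression, interpolation, and extrapolation, here is a minimal sketch using numpy.polyfit on a small synthetic dataset (the hours and scores below are invented for this example, not taken from the assignment data):
import numpy as np
# Synthetic observations: study hours vs. scores (made up for illustration)
hours = np.array([1, 2, 3, 4, 5], dtype=float)
score = np.array([42, 49, 61, 68, 81], dtype=float)
# Regression: fit a degree-1 polynomial (a straight line) to the points
slope, intercept = np.polyfit(hours, score, deg=1)
print(f"fitted model: score = {slope:.2f} * hours + {intercept:.2f}")
# Interpolation: predict inside the observed range (2.5 hours)
print("interpolated:", slope * 2.5 + intercept)
# Extrapolation: forecast outside the observed range (8 hours)
print("extrapolated:", slope * 8 + intercept)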
Consider a simple function that predicts a student's score based on the number of study hours:
`score = 10 * hours + 30`
First I need to understand these terms and why they matter in data science, using the equation above.
- Regression is a method in data science that finds a relationship between a number (output) and one or more factors (inputs).
- Regression is the process of discovering the above equation from data: finding a formula that connects "hours" (the input) with "score" (the output).
- A function is a rule or formula that takes an input and produces an output.
- The above equation becomes a function, which is simply a formula that takes an input and gives an output.
- A model is a function that has been learned from data. It is the tool that makes predictions.
- A model is basically a function that has been trained and tested using data.
- A coefficient tells how strongly an input affects the output. It is the number in front of a variable.
- In the above model, the coefficient is 10, indicating how strongly hours affects the score.
- A prediction is the value your model thinks will happen.
- It is the model's guess for a given input.
- The intercept is the value your model predicts when all inputs are zero.
- The model above predicts that if hours is zero, the student will score 30.
- Error is the difference between what really happened and what the model predicted.
- Residual is just another word for error, used in data science.
- Residual = Actual value − Predicted value
- It tells:
- how wrong the prediction was
- whether the model guessed too high or too low
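To make these terms concrete, here is a tiny sketch that applies the model above; the observed score of 95 is a made-up value used only to show how a residual is computed:
# The model: coefficient = 10, intercept = 30
def predict_score(hours):
    return 10 * hours + 30

predicted = predict_score(6)   # prediction: the model's guess for 6 hours
actual = 95                    # hypothetical observed score (invented for illustration)
residual = actual - predicted  # residual = actual value - predicted value
print(predicted, actual, residual)  # 90 95 5: positive residual, so the model guessed too low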
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import scipy.stats as stats
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"
df = pd.read_csv(file_path)
df.head()
| | student_id | age | gender | major | study_hours_per_day | social_media_hours | netflix_hours | part_time_job | attendance_percentage | sleep_hours | ... | screen_time | study_environment | access_to_tutoring | family_income_range | parental_support_level | motivation_level | exam_anxiety_score | learning_style | time_management_score | exam_score |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100000 | 26 | Male | Computer Science | 7.645367 | 3.0 | 0.1 | Yes | 70.3 | 6.2 | ... | 10.9 | Co-Learning Group | Yes | High | 9 | 7 | 8 | Reading | 3.0 | 100 |
| 1 | 100001 | 28 | Male | Arts | 5.700000 | 0.5 | 0.4 | No | 88.4 | 7.2 | ... | 8.3 | Co-Learning Group | Yes | Low | 7 | 2 | 10 | Reading | 6.0 | 99 |
| 2 | 100002 | 17 | Male | Arts | 2.400000 | 4.2 | 0.7 | No | 82.1 | 9.2 | ... | 8.0 | Library | Yes | High | 3 | 9 | 6 | Kinesthetic | 7.6 | 98 |
| 3 | 100003 | 27 | Other | Psychology | 3.400000 | 4.6 | 2.3 | Yes | 79.3 | 4.2 | ... | 11.7 | Co-Learning Group | Yes | Low | 5 | 3 | 10 | Reading | 3.2 | 100 |
| 4 | 100004 | 25 | Female | Business | 4.700000 | 0.8 | 2.7 | Yes | 62.9 | 6.5 | ... | 9.4 | Quiet Room | Yes | Medium | 9 | 1 | 10 | Reading | 7.1 | 98 |
5 rows × 31 columns
distinct_count = df.drop_duplicates().shape[0]
Counting the distinct rows to get the number of unique records in the dataset:
print(distinct_count)
80000
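As a quick cross-check (a small sketch, equivalent to the distinct count above), pandas' duplicated() counts duplicate rows directly; since the distinct count equals the total row count, there should be none:
print(len(df))                # total number of rows: 80000
print(df.duplicated().sum())  # number of duplicated rows: expected 0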
max_study_value = df["study_hours_per_day"].max()
print(max_study_value)
min_study_value = df["study_hours_per_day"].min()
print(min_study_value)
12.0
0.0
max_moti_value = df["motivation_level"].max()
print(max_moti_value)
min_moti_value = df["motivation_level"].min()
print(min_moti_value)
10
1
majors = tuple(df["major"].unique())
print(majors)
('Computer Science', 'Arts', 'Psychology', 'Business', 'Engineering', 'Biology')
numeric_cols = df.select_dtypes(include="number").columns
corr = df[numeric_cols].corr()["exam_score"].sort_values(ascending=False)
print(corr)
exam_score                1.000000
previous_gpa              0.932940
motivation_level          0.250287
study_hours_per_day       0.241460
screen_time               0.169788
sleep_hours               0.090820
exercise_frequency        0.086983
mental_health_rating      0.010556
student_id                0.007557
time_management_score     0.005940
attendance_percentage     0.002876
semester                  0.000541
age                       0.000487
netflix_hours            -0.001271
social_activity          -0.002795
parental_support_level   -0.006333
social_media_hours       -0.006351
stress_level             -0.118550
exam_anxiety_score       -0.235909
Name: exam_score, dtype: float64
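As an optional complement to the ranked list above, the same correlations can be scanned visually with a one-column heatmap; this is a minimal sketch using the seaborn and matplotlib imports already loaded:
# Visualize the correlations with the target as an annotated heatmap
plt.figure(figsize=(4, 8))
sns.heatmap(corr.to_frame(name="corr_with_exam_score"), annot=True, cmap="coolwarm")
plt.title("Correlation with exam_score")
plt.show()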
x_axis = "stress_level"
y_axis = "age"
x = df[x_axis]
y = df[y_axis]
plt.scatter(x, y)
plt.xlabel(x_axis)
plt.ylabel(y_axis)
plt.title("Scatter: " + x_axis + " vs " + y_axis)
plt.show()
cols = [
"study_hours_per_day",
"social_media_hours",
"sleep_hours",
"previous_gpa",
"time_management_score",
"exam_score"
]
pd.plotting.scatter_matrix(df[cols], figsize=(12, 12))
plt.show()
Probability Distribution Analysis of Key Student Data Features¶
This notebook investigates the probability distributions of the most important numerical features in the dataset. Understanding the distribution is crucial for selecting appropriate statistical methods and machine learning models.
# Define the key numerical features to analyze
# We select the target and the top numeric predictors from the previous analysis
KEY_NUMERIC_FEATURES = [
'exam_score',
'previous_gpa',
'study_hours_per_day',
'motivation_level',
'exam_anxiety_score',
'sleep_hours'
]
# Drop any rows with missing values in the key features for clean analysis
df_clean = df[KEY_NUMERIC_FEATURES].dropna()
print(f"Data shape for analysis: {df_clean.shape}")
print("Features to analyze:", KEY_NUMERIC_FEATURES)
Data shape for analysis: (80000, 6)
Features to analyze: ['exam_score', 'previous_gpa', 'study_hours_per_day', 'motivation_level', 'exam_anxiety_score', 'sleep_hours']
1. Visual Inspection: Histograms and Q-Q Plots¶
We will use Histograms to visualize the shape of the distribution and Quantile-Quantile (Q-Q) Plots to check how closely the data follows a theoretical normal distribution. If the data is normally distributed, the points on the Q-Q plot should lie close to the straight line.
# Function to plot distribution and Q-Q plot
def plot_distribution(data, feature):
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    # Histogram
    sns.histplot(data[feature], kde=True, ax=axes[0], color='skyblue')
    axes[0].set_title(f'Histogram of {feature}')
    axes[0].set_xlabel(feature)
    axes[0].set_ylabel('Frequency')
    # Q-Q Plot
    stats.probplot(data[feature], dist="norm", plot=axes[1])
    axes[1].set_title(f'Q-Q Plot of {feature}')
    plt.tight_layout()
    plt.show()

# Plot for each key feature
for feature in KEY_NUMERIC_FEATURES:
    plot_distribution(df_clean, feature)
2. Statistical Testing: Shapiro-Wilk Test for Normality¶
The Shapiro-Wilk test is a formal statistical test for checking if a sample comes from a normally distributed population.
- Null Hypothesis ($H_0$): The data is drawn from a normal distribution.
- Alternative Hypothesis ($H_a$): The data is NOT drawn from a normal distribution.
If the p-value is less than a significance level (e.g., 0.05), we reject the null hypothesis and conclude the data is not normally distributed.
# Function to perform Shapiro-Wilk test
def shapiro_wilk_test(data, feature):
    # The Shapiro-Wilk test is computationally expensive and is not recommended for samples > 5000.
    # Since the dataset has 80,000 rows, we will take a random sample of 5000 for the test.
    sample_size = 5000
    if len(data) > sample_size:
        sample = data[feature].sample(n=sample_size, random_state=42)
    else:
        sample = data[feature]
    stat, p = stats.shapiro(sample)
    print(f"\n--- Shapiro-Wilk Test for {feature} (Sample Size: {len(sample)}) ---")
    print(f'Test Statistic (W): {stat:.4f}')
    print(f'P-value: {p:.4e}')
    if p < 0.05:
        print("Conclusion: Reject Null Hypothesis. The sample is NOT normally distributed.")
    else:
        print("Conclusion: Fail to Reject Null Hypothesis. The sample is normally distributed.")

# Perform the test for each key feature
for feature in KEY_NUMERIC_FEATURES:
    shapiro_wilk_test(df_clean, feature)
# Final Note:
# For large datasets, even minor deviations from normality can lead to a rejection of the null hypothesis.
# In practice, visual inspection (Q-Q plot) and domain knowledge are often more important than the p-value for determining if a distribution is 'close enough' to normal for modeling purposes.
--- Shapiro-Wilk Test for exam_score (Sample Size: 5000) ---
Test Statistic (W): 0.8585
P-value: 3.1386e-55
Conclusion: Reject Null Hypothesis. The sample is NOT normally distributed.

--- Shapiro-Wilk Test for previous_gpa (Sample Size: 5000) ---
Test Statistic (W): 0.8267
P-value: 6.5840e-59
Conclusion: Reject Null Hypothesis. The sample is NOT normally distributed.

--- Shapiro-Wilk Test for study_hours_per_day (Sample Size: 5000) ---
Test Statistic (W): 0.9942
P-value: 2.4854e-13
Conclusion: Reject Null Hypothesis. The sample is NOT normally distributed.

--- Shapiro-Wilk Test for motivation_level (Sample Size: 5000) ---
Test Statistic (W): 0.9385
P-value: 2.2076e-41
Conclusion: Reject Null Hypothesis. The sample is NOT normally distributed.

--- Shapiro-Wilk Test for exam_anxiety_score (Sample Size: 5000) ---
Test Statistic (W): 0.7764
P-value: 1.0260e-63
Conclusion: Reject Null Hypothesis. The sample is NOT normally distributed.

--- Shapiro-Wilk Test for sleep_hours (Sample Size: 5000) ---
Test Statistic (W): 0.9944
P-value: 4.2268e-13
Conclusion: Reject Null Hypothesis. The sample is NOT normally distributed.
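Given the final note above, one alternative worth sketching is scipy's D'Agostino-Pearson test (scipy.stats.normaltest), which has no hard sample-size cap and can run on all 80,000 rows; note that at this sample size it is just as prone to rejecting the null over tiny deviations:
# D'Agostino-Pearson normality test on the full columns (no subsampling)
for feature in KEY_NUMERIC_FEATURES:
    stat, p = stats.normaltest(df_clean[feature])
    print(f"{feature}: statistic={stat:.2f}, p-value={p:.2e}")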
Predictive Modeling and Visualization: Exam Score Prediction¶
The goal here is to build a simple Linear Regression model to predict exam_score using the minimal set of highly predictive features identified in the feature analysis above. We then visualize the model's performance.
# Define the minimal feature set and the target variable
# Categorical features need encoding
FEATURES = [
'previous_gpa',
'motivation_level',
'exam_anxiety_score',
'access_to_tutoring',
'study_environment'
]
TARGET = 'exam_score'
df = df.dropna(subset=FEATURES + [TARGET])
print(f"Original Data Shape: {df.shape}")
# 1. Data Preparation: Encoding Categorical Features
# We use One-Hot Encoding for the categorical features in our minimal set.
df_model = pd.get_dummies(df[FEATURES], columns=['access_to_tutoring', 'study_environment'], drop_first=True)
# Add the target variable back to the model dataframe
df_model[TARGET] = df[TARGET]
print(f"Data Shape after Encoding: {df_model.shape}")
print("Features used for modeling:", list(df_model.columns[:-1]))
# Define X (features) and y (target)
X = df_model.drop(columns=[TARGET])
y = df_model[TARGET]
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")
Original Data Shape: (80000, 31)
Data Shape after Encoding: (80000, 9)
Features used for modeling: ['previous_gpa', 'motivation_level', 'exam_anxiety_score', 'access_to_tutoring_Yes', 'study_environment_Co-Learning Group', 'study_environment_Dorm', 'study_environment_Library', 'study_environment_Quiet Room']
Training set size: 64000 samples
Testing set size: 16000 samples
2. Model Training and Evaluation¶
We will use a simple Linear Regression model. This model assumes a linear relationship between the features and the target variable.
# Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = model.predict(X_test)
# Evaluate the model
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"R-squared (R2) Score: {r2:.4f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
# Interpretation:
# R2 score close to 1 means the model explains a high proportion of the variance in the exam score.
# RMSE represents the average error in the prediction, in the same units as the exam score.
R-squared (R2) Score: 0.8705
Mean Squared Error (MSE): 17.54
Root Mean Squared Error (RMSE): 4.19
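To tie these metrics back to the residual definition from the terminology section, here is a short sketch computing RMSE by hand from the residuals; it should reproduce the sklearn value above:
# RMSE from first principles: the root of the mean squared residual
residuals = y_test - y_pred
manual_rmse = np.sqrt(np.mean(residuals ** 2))
print(f"Manual RMSE: {manual_rmse:.2f}")  # should match the RMSE reported above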
3. Visualization: Actual vs. Predicted Exam Score¶
To visualize the model's fit, we create a scatter plot comparing the Actual Exam Score (y_test) against the Predicted Exam Score (y_pred). A perfect model would have all points lying on the diagonal line $y=x$ (the red line below). The closer the points are to this line, the better the model's fit.
plt.figure(figsize=(10, 8))
# Scatter plot of Actual vs. Predicted values
plt.scatter(y_test, y_pred, alpha=0.5, color='skyblue', label='Predicted Scores')
# Plot the perfect prediction line (y=x)
max_score = max(y_test.max(), y_pred.max())
min_score = min(y_test.min(), y_pred.min())
plt.plot([min_score, max_score], [min_score, max_score], color='red', linestyle='--', linewidth=2, label='Perfect Fit Line')
plt.title('Model Fit: Actual vs. Predicted Exam Score')
plt.xlabel('Actual Exam Score')
plt.ylabel('Predicted Exam Score')
plt.legend()
plt.grid(True)
plt.show()
# Conclusion:
# The tight clustering of points around the red line confirms the model's strong predictive power, largely driven by the inclusion of 'previous_gpa'.
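One way to check the claim that previous_gpa drives the fit is to inspect the learned coefficients; this is a minimal sketch (note the features are unscaled, so raw magnitudes are only a rough guide to importance):
# Inspect fitted coefficients, largest magnitude first
coefs = pd.Series(model.coef_, index=X.columns).sort_values(key=abs, ascending=False)
print(coefs)
print(f"Intercept: {model.intercept_:.2f}")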