Transform Assignment¶
Goal¶
Find the most informative data representations.
In simple terms, this means taking your data, which is currently described by many complex features (dimensions), and transforming it into a much smaller set of new, abstract features that still capture most of the important structure and variation in the data.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Needed for Preprocessing (Standardization)
from sklearn.preprocessing import StandardScaler
# Needed for Dimensionality Reduction (PCA)
from sklearn.decomposition import PCA
After exploring the assignment, I understood its core idea, which I can explain this way: if I have a complex sculpture (high-dimensional data) and I want to understand its overall form, I don't need to describe every tiny detail. Finding the "most informative representation" is like finding the clearest shadow cast by the sculpture: a simple 2D or 3D outline that captures the fundamental shape and orientation, allowing us to understand the object without dealing with every minute point on its surface. And I find this genuinely amazing.
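To make the shadow analogy concrete, here is a tiny sketch on synthetic data (not the student dataset): 3D points that actually lie close to a 2D plane, where the 2D "shadow" found by PCA keeps almost all of the variance.
# Toy sketch (synthetic data, not the student dataset): 200 points in 3D that
# actually lie close to a 2D plane. PCA should find that plane as the "shadow".
rng = np.random.default_rng(0)
plane_coords = rng.normal(size=(200, 2))                      # true 2D positions
embedding = np.array([[1.0, 0.5, 0.2],
                      [0.3, 1.0, 0.7]])                       # maps 2D -> 3D
points_3d = plane_coords @ embedding + 0.05 * rng.normal(size=(200, 3))
toy_pca = PCA(n_components=2)
toy_pca.fit(points_3d)
print(f"Variance kept by the 2D 'shadow': {toy_pca.explained_variance_ratio_.sum():.3f}")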
# Load your data
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"
df = pd.read_csv(file_path)
# Extract the data matrix X
# (FEATURE_COLS is an initial numeric subset defined in an earlier cell;
#  a hypothetical example of such a list is sketched below)
X = df[FEATURE_COLS].values
print(f"Original data shape (N samples x D dimensions): {X.shape}")
Original data shape (N samples x D dimensions): (80000, 8)
In the code above, we load the dataset and extract a data matrix X containing the numeric feature columns.
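For reference, FEATURE_COLS itself is not defined in the cell above; it comes from an earlier cell. Below is a minimal sketch of what such an 8-column numeric subset might look like; the specific column choice here is my assumption, not the notebook's actual list.
# Hypothetical sketch of FEATURE_COLS: the real list is defined in an earlier
# cell of the notebook; these 8 numeric columns are only an illustrative guess.
FEATURE_COLS = [
    'study_hours_per_day', 'social_media_hours', 'netflix_hours',
    'attendance_percentage', 'sleep_hours', 'screen_time',
    'time_management_score', 'exam_score',
]
X = df[FEATURE_COLS].values
print(f"Original data shape (N samples x D dimensions): {X.shape}")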
# 1. Identify all columns with numerical data type
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
# 2. Exclude non-feature columns (identifiers or semester data)
# 'student_id' is an identifier, and 'semester' is often treated as time-based or non-habitual in structure analysis.
EXCLUDED_COLS = ['student_id', 'semester']
# Create the final list of features for PCA
FEATURE_COLS_FULL = [col for col in numerical_cols if col not in EXCLUDED_COLS]
# Extract the data matrix X
X = df[FEATURE_COLS_FULL].values
D_new = X.shape[1]
print(f"New input dimensionality for PCA: D={D_new} features.")
# The exact D depends on how many numeric columns survive the exclusion step.
New input dimensionality for PCA: D=17 features.
Here I identify the non-feature columns and exclude them to build the final feature list for PCA; the output above shows that 17 features remain.
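As a quick sanity check (a sketch reusing the variables above), one can list the selected columns and confirm the excluded ones are gone:
# Sanity check: list the 17 selected columns and confirm the excluded
# identifier/semester columns did not slip through.
print(f"{len(FEATURE_COLS_FULL)} features selected for PCA:")
print(FEATURE_COLS_FULL)
assert 'student_id' not in FEATURE_COLS_FULL
assert 'semester' not in FEATURE_COLS_FULL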
df.head()
| | student_id | age | gender | major | study_hours_per_day | social_media_hours | netflix_hours | part_time_job | attendance_percentage | sleep_hours | ... | screen_time | study_environment | access_to_tutoring | family_income_range | parental_support_level | motivation_level | exam_anxiety_score | learning_style | time_management_score | exam_score |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100000 | 26 | Male | Computer Science | 7.645367 | 3.0 | 0.1 | Yes | 70.3 | 6.2 | ... | 10.9 | Co-Learning Group | Yes | High | 9 | 7 | 8 | Reading | 3.0 | 100 |
| 1 | 100001 | 28 | Male | Arts | 5.700000 | 0.5 | 0.4 | No | 88.4 | 7.2 | ... | 8.3 | Co-Learning Group | Yes | Low | 7 | 2 | 10 | Reading | 6.0 | 99 |
| 2 | 100002 | 17 | Male | Arts | 2.400000 | 4.2 | 0.7 | No | 82.1 | 9.2 | ... | 8.0 | Library | Yes | High | 3 | 9 | 6 | Kinesthetic | 7.6 | 98 |
| 3 | 100003 | 27 | Other | Psychology | 3.400000 | 4.6 | 2.3 | Yes | 79.3 | 4.2 | ... | 11.7 | Co-Learning Group | Yes | Low | 5 | 3 | 10 | Reading | 3.2 | 100 |
| 4 | 100004 | 25 | Female | Business | 4.700000 | 0.8 | 2.7 | Yes | 62.9 | 6.5 | ... | 9.4 | Quiet Room | Yes | Medium | 9 | 1 | 10 | Reading | 7.1 | 98 |
5 rows × 31 columns
# 1. Initialize the Scaler
scaler = StandardScaler()
# 2. Fit and transform the data
X_scaled = scaler.fit_transform(X)
print("Data successfully standardized (Mean should be ~0, Std Dev ~1).")
print(f"Example mean of first feature: {np.mean(X_scaled[:, 0]):.4f}")
print(f"Example std dev of first feature: {np.std(X_scaled[:, 0]):.4f}")
Data successfully standardized (Mean should be ~0, Std Dev ~1).
Example mean of first feature: -0.0000
Example std dev of first feature: 1.0000
I applied a StandardScaler to ensure all 17 features have zero mean and unit variance. This is a critical step: without it, features with larger raw scales (like exam_score, up to 100) would unfairly dominate features with smaller scales (like study_hours_per_day, up to about 12) in the variance calculations.
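To illustrate this on the actual features (a sketch reusing X and X_scaled from above), compare the raw variances with the standardized ones:
# Illustration: compare raw and standardized feature variances. Without
# scaling, PCA's variance criterion is dominated by the widest-range columns.
raw_variance = pd.Series(np.var(X, axis=0), index=FEATURE_COLS_FULL)
scaled_variance = pd.Series(np.var(X_scaled, axis=0), index=FEATURE_COLS_FULL)
print("Three largest raw variances:")
print(raw_variance.sort_values(ascending=False).head(3))
print("After StandardScaler, every feature has variance ~1:")
print(scaled_variance.round(3).head(3))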
# Initialize PCA: Keep all components initially to analyze variance distribution
pca = PCA()
# Fit PCA to the standardized data
pca.fit(X_scaled)
# The total number of components found should equal the original number of features
D = pca.n_components_
print(f"PCA fitted, {D} components found.")
PCA fitted, 17 components found.
Fitting is the "learning" phase: the algorithm calculates the specific principal components (their directions and how much variance each explains) that define the PCA model for this data, determining the optimal way to reduce its dimensionality.
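Equivalently, what PCA learns here is the eigenstructure of the covariance matrix of the standardized data; a quick sketch to verify that relationship:
# Cross-check: the explained variances learned by PCA are the eigenvalues of
# the covariance matrix of the standardized data (largest first).
cov_matrix = np.cov(X_scaled, rowvar=False)          # 17 x 17
eigenvalues = np.linalg.eigvalsh(cov_matrix)[::-1]   # sorted descending
print(np.allclose(eigenvalues, pca.explained_variance_))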
# variance explained by each component
explained_variance_ratio = pca.explained_variance_ratio_
# 2. Cumulative explained variance
cumulative_variance = np.cumsum(explained_variance_ratio)
# 3. Find the number of components needed to explain 90% of the variance
threshold = 0.90
n_components_90 = np.argmax(cumulative_variance >= threshold) + 1
print(f"Number of components needed to retain {threshold*100}% variance: {n_components_90}")
Number of components needed to retain 90.0% variance: 13
How many principal components are enough to preserve most of the information (variance) in the data? The answer I got is 13.
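As an aside, scikit-learn can also choose this number automatically when n_components is passed as a variance fraction; a small sketch of that alternative:
# Alternative: let scikit-learn pick the component count by passing the
# desired variance fraction (a float between 0 and 1) as n_components.
pca_90 = PCA(n_components=0.90)
pca_90.fit(X_scaled)
print(f"Components chosen automatically for 90% variance: {pca_90.n_components_}")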
# Plot the explained variance
plt.figure(figsize=(10, 6))
plt.plot(range(1, D + 1), explained_variance_ratio, marker='o', linestyle='-', label='Individual Explained Variance')
plt.plot(range(1, D + 1), cumulative_variance, marker='x', linestyle='--', color='red', label='Cumulative Explained Variance')
plt.axhline(threshold, color='green', linestyle=':', label=f'{threshold*100}% Threshold')
plt.axvline(n_components_90, color='orange', linestyle='--', label=f'Optimal K={n_components_90}')
plt.title('PCA Explained Variance Analysis (Scree Plot)')
plt.xlabel('Principal Component Index')
plt.ylabel('Variance Explained Ratio')
plt.legend()
plt.grid(True)
plt.show()
print(pca.components_.shape)
print(len(FEATURE_COLS_FULL))
(17, 17)
17
# Create a DataFrame to easily visualize component weights (loadings)
loadings_df = pd.DataFrame(
pca.components_,
columns=FEATURE_COLS_FULL, # MUST match PCA input
index=[f'PC{i+1}' for i in range(pca.components_.shape[0])]
)
# Display the loadings for the top 3 components (or based on n_components_90)
print("\nTop Principal Component Loadings:")
print(loadings_df.iloc[:3, :].transpose())
Top Principal Component Loadings:
PC1 PC2 PC3
age -0.001279 -0.005808 0.011714
study_hours_per_day 0.289554 0.420762 0.014431
social_media_hours 0.087696 0.314310 -0.320676
netflix_hours 0.069731 0.249372 -0.237795
attendance_percentage 0.004242 0.005215 0.000222
sleep_hours 0.060273 -0.006706 0.168713
exercise_frequency 0.059854 -0.015416 0.152393
mental_health_rating 0.012714 0.002322 0.083794
previous_gpa 0.513185 -0.029192 0.383787
stress_level -0.079227 0.006218 -0.242618
social_activity -0.002933 -0.004837 0.003183
screen_time 0.288112 0.583269 -0.263755
parental_support_level -0.006711 0.000743 0.002090
motivation_level 0.383722 -0.400008 -0.414284
exam_anxiety_score -0.377701 0.399700 0.422631
time_management_score 0.007523 0.003603 0.000810
exam_score 0.503542 -0.027145 0.394435
Interpretation Example:¶
- PC1 (Component 1) has large positive weights for 'previous_gpa', 'exam_score', and 'motivation_level',
- and a large negative weight for 'exam_anxiety_score' (with a smaller negative weight for 'stress_level').
- PC1 likely represents a dimension of "Overall Academic Success/Resilience." (The sketch below shows one quick way to rank loadings like this programmatically.)
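A small sketch reusing loadings_df from above: rank features by absolute loading to read each component at a glance.
# Rank features by absolute loading to read each component at a glance.
for pc in ['PC1', 'PC2', 'PC3']:
    top_features = loadings_df.loc[pc].abs().sort_values(ascending=False).head(4)
    print(f"{pc}: {list(top_features.index)}")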
# Transform the data into the reduced PCA space (e.g., using 2 components for 2D visualization)
K_visual = 2
pca_final = PCA(n_components=K_visual)
X_transformed = pca_final.fit_transform(X_scaled)
print(f"\nData successfully projected from {D}D to {K_visual}D.")
print(f"Transformed data shape: {X_transformed.shape}")
Data successfully projected from 17D to 2D.
Transformed data shape: (80000, 2)
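Before reading too much into the 2D picture, it is worth checking how much variance the two retained components actually capture (a quick sketch using pca_final from above):
# How faithful is the 2D view? Check the variance retained by the two
# components actually used for the plot below.
variance_kept = pca_final.explained_variance_ratio_.sum()
print(f"Variance captured by the first {K_visual} components: {variance_kept:.1%}")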
# Visualize the data in the new 2D PCA space
plt.figure(figsize=(8, 8))
plt.scatter(X_transformed[:, 0], X_transformed[:, 1], alpha=0.5, s=10)
plt.xlabel(f'Principal Component 1 (PC1)')
plt.ylabel(f'Principal Component 2 (PC2)')
plt.title(f'Student Data Projected onto First {K_visual} Principal Components')
plt.grid(True)
plt.show()
- X-axis (PC1): This is the first Principal Component, the single direction in the 17-dimensional space that contains the highest amount of variance (information). Based on my analysis, students further to the right on this axis likely have higher GPAs and motivation, while those to the left have higher stress and anxiety (a colored version of this plot is sketched after this list).
- Y-axis (PC2): This is the second Principal Component. It represents the direction with the second-highest variance that is completely independent of (perpendicular to) PC1.
- What the Data Points Represent: Each dot in the "cloud" is an individual student from the dataset's 80,000 records.
- What the Graph Tells You: Because the 2D projection captures a significant portion of the total variance, the shape and clusters within this cloud reveal natural groupings of student profiles.
- If you see distinct clusters or specific "tails" in the data cloud, you are seeing structural similarities between students that were impossible to visualize when the data was spread across 17 different columns.
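One way to check the PC1 reading above is to color the projection by exam_score (a sketch; note that exam_score is itself one of the 17 inputs, so some circularity is expected):
# Color the projection by exam_score to check the PC1 interpretation above.
# (exam_score is itself one of the 17 PCA inputs, so some circularity is expected.)
plt.figure(figsize=(8, 8))
points = plt.scatter(X_transformed[:, 0], X_transformed[:, 1],
                     c=df['exam_score'], alpha=0.5, s=10, cmap='viridis')
plt.colorbar(points, label='exam_score')
plt.xlabel('Principal Component 1 (PC1)')
plt.ylabel('Principal Component 2 (PC2)')
plt.title('Projection colored by exam_score')
plt.grid(True)
plt.show()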
Analogy: Imagine my 17-dimensional data is like a complex 3D sculpture of a student. The final graph is the shadow cast by that sculpture onto a flat wall. While some detail is lost (the variance discarded by the projection), the shadow provides the clearest possible "outline" of the sculpture's true shape and posture, allowing you to understand a student's overall profile at a single glance.