import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
# -------------------------------
# 1. Load the CSV safely
# -------------------------------
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"
if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    print("File loaded successfully!")
else:
    raise FileNotFoundError(f"The file '{file_path}' does not exist in {os.getcwd()}")
# -------------------------------
# 2. Inspect the data
# -------------------------------
print(df.head())
df.info()  # .info() prints directly; wrapping it in print() would add a stray "None"
# -------------------------------
# 3. Select numeric columns only for PCA
# -------------------------------
numeric_df = df.select_dtypes(include=np.number)
# -------------------------------
# 4. Standardize the data
# -------------------------------
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)
# -------------------------------
# 5. Perform PCA (2 components for visualization)
# -------------------------------
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)
# Create a DataFrame with the principal components
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
# -------------------------------
# 6. Plot PCA results
# -------------------------------
plt.figure(figsize=(10, 7))
sns.scatterplot(x='PC1', y='PC2', data=pca_df, s=100)
plt.title('PCA of Student Habits Dataset')
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.2f}% variance)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.2f}% variance)")
plt.grid(True)
plt.show()
# -------------------------------
# 7. Explained variance
# -------------------------------
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Cumulative variance explained:", np.cumsum(pca.explained_variance_ratio_))
File loaded successfully!
   student_id  age  gender             major  study_hours_per_day  \
0      100000   26    Male  Computer Science             7.645367
1      100001   28    Male              Arts             5.700000
2      100002   17    Male              Arts             2.400000
3      100003   27   Other        Psychology             3.400000
4      100004   25  Female          Business             4.700000

   social_media_hours  netflix_hours part_time_job  attendance_percentage  \
0                 3.0            0.1           Yes                   70.3
1                 0.5            0.4            No                   88.4
2                 4.2            0.7            No                   82.1
3                 4.6            2.3           Yes                   79.3
4                 0.8            2.7           Yes                   62.9

   sleep_hours  ...  screen_time  study_environment access_to_tutoring  \
0          6.2  ...         10.9  Co-Learning Group                Yes
1          7.2  ...          8.3  Co-Learning Group                Yes
2          9.2  ...          8.0            Library                Yes
3          4.2  ...         11.7  Co-Learning Group                Yes
4          6.5  ...          9.4         Quiet Room                Yes

  family_income_range  parental_support_level  motivation_level  \
0                High                       9                 7
1                 Low                       7                 2
2                High                       3                 9
3                 Low                       5                 3
4              Medium                       9                 1

   exam_anxiety_score learning_style  time_management_score  exam_score
0                   8        Reading                    3.0         100
1                  10        Reading                    6.0          99
2                   6    Kinesthetic                    7.6          98
3                  10        Reading                    3.2         100
4                  10        Reading                    7.1          98

[5 rows x 31 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 31 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   student_id                     80000 non-null  int64
 1   age                            80000 non-null  int64
 2   gender                         80000 non-null  object
 3   major                          80000 non-null  object
 4   study_hours_per_day            80000 non-null  float64
 5   social_media_hours             80000 non-null  float64
 6   netflix_hours                  80000 non-null  float64
 7   part_time_job                  80000 non-null  object
 8   attendance_percentage          80000 non-null  float64
 9   sleep_hours                    80000 non-null  float64
 10  diet_quality                   80000 non-null  object
 11  exercise_frequency             80000 non-null  int64
 12  parental_education_level       80000 non-null  object
 13  internet_quality               80000 non-null  object
 14  mental_health_rating           80000 non-null  float64
 15  extracurricular_participation  80000 non-null  object
 16  previous_gpa                   80000 non-null  float64
 17  semester                       80000 non-null  int64
 18  stress_level                   80000 non-null  float64
 19  dropout_risk                   80000 non-null  object
 20  social_activity                80000 non-null  int64
 21  screen_time                    80000 non-null  float64
 22  study_environment              80000 non-null  object
 23  access_to_tutoring             80000 non-null  object
 24  family_income_range            80000 non-null  object
 25  parental_support_level         80000 non-null  int64
 26  motivation_level               80000 non-null  int64
 27  exam_anxiety_score             80000 non-null  int64
 28  learning_style                 80000 non-null  object
 29  time_management_score          80000 non-null  float64
 30  exam_score                     80000 non-null  int64
dtypes: float64(10), int64(9), object(12)
memory usage: 18.9+ MB
Explained variance ratio: [0.13650684 0.10294555]
Cumulative variance explained: [0.13650684 0.2394524 ]
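Two components capture only about 24% of the variance here, so the 2-D view is a rough summary. A scree plot shows how much each extra component would add; a minimal sketch, assuming scaled_data from the cell above is still in scope:
# Fit PCA with all components to inspect the full variance spectrum
full_pca = PCA().fit(scaled_data)
plt.figure(figsize=(8, 5))
plt.plot(np.cumsum(full_pca.explained_variance_ratio_), marker='o')
plt.axhline(0.9, color='grey', linestyle='--', lw=1)  # e.g., a 90% target line
plt.xlabel("Number of components")
plt.ylabel("Cumulative explained variance")
plt.title("Scree Plot (cumulative)")
plt.grid(True)
plt.show()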
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
# -------------------------------
# 1. Load the CSV safely
# -------------------------------
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"
if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    print("File loaded successfully!")
else:
    raise FileNotFoundError(f"The file '{file_path}' does not exist in {os.getcwd()}")
# -------------------------------
# 2. Select numeric columns only for PCA
# -------------------------------
numeric_df = df.select_dtypes(include=np.number)
# -------------------------------
# 3. Standardize the data
# -------------------------------
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)
# -------------------------------
# 4. Perform PCA
# -------------------------------
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)
# Create a DataFrame for plotting
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
# -------------------------------
# 5. PCA Biplot
# -------------------------------
plt.figure(figsize=(12, 8))
# Plot students
sns.scatterplot(x='PC1', y='PC2', data=pca_df, s=100, color='lightcoral', alpha=0.7)
# Plot feature vectors (arrows)
for i, feature in enumerate(numeric_df.columns):
    plt.arrow(0, 0,
              pca.components_[0, i] * 5,  # scale arrows for visibility
              pca.components_[1, i] * 5,
              color='black', alpha=0.7, head_width=0.2)
    plt.text(pca.components_[0, i] * 5.2,
             pca.components_[1, i] * 5.2,
             feature, color='black', fontsize=10)
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)")
plt.title("PCA Biplot of Student Habits Dataset")
plt.grid(True)
plt.axhline(0, color='grey', lw=1)
plt.axvline(0, color='grey', lw=1)
plt.show()
File loaded successfully!
My data
I have a dataset where each row = a student and each column = a habit or performance metric (e.g., study hours, sleep hours, attendance, exam score).
Each student has values for all habits.
Some habits may matter more, or vary more across students, than others.
From ChatGPT:
What PCA does here
When you run PCA on this dataset:
Standardization: all habits are rescaled to zero mean and unit variance, so differences in units (hours, scores, percentages) don't dominate.
Find patterns: PCA identifies directions (principal components) that capture the largest differences between students.
Create new variables:
PC1 = the combination of habits that explains the most variation between students.
PC2 = the next combination, orthogonal to PC1, that explains the most variation not captured by PC1.
Each PC is a weighted sum of the original habits, so every habit contributes to every PC; the weights are called loadings (see the sketch below).
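To make that last point concrete, here is a minimal, self-contained sketch on synthetic numbers (not the student dataset) showing that each PC score is exactly the centered data multiplied by the loading weights:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Toy data: 6 "students" x 3 "habits" (synthetic, for illustration only)
rng = np.random.default_rng(0)
X = rng.normal(size=(6, 3))
scaled = StandardScaler().fit_transform(X)
pca = PCA(n_components=2)
scores = pca.fit_transform(scaled)
# Each PC score = centered data @ loading weights (rows of pca.components_)
manual_scores = (scaled - pca.mean_) @ pca.components_.T
print(np.allclose(scores, manual_scores))  # True: PCs are weighted sums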
How to interpret the PCA biplot
Dots = individual students, positioned by their habit profile. Students close together have similar habits/performance.
Arrows = habits/features, showing how strongly each habit contributes to the principal components.
Long arrows = the habit contributes strongly to the variation shown in the plot.
Direction of arrow = the sign of the habit's correlation with each PC.
Example interpretation:
If the "study hours" arrow points mostly along PC1, then variation in study hours is a major driver of PC1.
If the "sleep hours" arrow points in the opposite direction to "study hours", these habits are inversely related in the data.
Students out toward the tip of the "study hours" arrow have high study hours, while those on the opposite side have low study hours (a quick numerical check follows below).
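The arrow reading can be verified numerically: the sign of each habit's correlation with a PC matches the direction of its arrow. A quick check, assuming numeric_df, scaled_data, and principal_components from the biplot cell above are still in scope (a PC's overall sign is arbitrary, so signs can flip between runs):
import numpy as np
# Correlation of each habit with PC1; the sign should match the arrow direction
for i, feature in enumerate(numeric_df.columns):
    r = np.corrcoef(scaled_data[:, i], principal_components[:, 0])[0, 1]
    print(f"{feature:30s} corr with PC1 = {r:+.2f}")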
Why this helps
You can see which habits matter most for distinguishing students.
You can group students with similar habits visually.
It helps identify patterns: e.g., students who study a lot and attend regularly might cluster together (a clustering sketch follows below).
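That grouping idea can be made concrete by clustering the PC scores. A minimal sketch, assuming pca_df from the biplot cell is in scope; the choice of 3 clusters is arbitrary, for illustration only:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
# Cluster students in PC space; k=3 is an illustrative guess, not tuned
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
pca_df['cluster'] = kmeans.fit_predict(pca_df[['PC1', 'PC2']])
plt.figure(figsize=(10, 7))
sns.scatterplot(x='PC1', y='PC2', hue='cluster', data=pca_df, s=60, alpha=0.7, palette='Set2')
plt.title("Students Clustered in PC Space (illustrative)")
plt.show()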
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# -------------------------------
# 1. Load the CSV safely
# -------------------------------
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"
if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    print("File loaded successfully!")
else:
    raise FileNotFoundError(f"The file '{file_path}' does not exist in {os.getcwd()}")
# -------------------------------
# 2. Select numeric columns only for PCA
# -------------------------------
numeric_df = df.select_dtypes(include=np.number)
# -------------------------------
# 3. Standardize the data
# -------------------------------
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)
# -------------------------------
# 4. Perform PCA
# -------------------------------
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)
# -------------------------------
# 5. PCA Results: feature contributions
# -------------------------------
loadings = pd.DataFrame(pca.components_.T,
                        columns=['PC1', 'PC2'],
                        index=numeric_df.columns)
print("\nFeature contributions to each principal component:")
print(loadings)
# Identify top contributing habits for each PC
top_pc1 = loadings['PC1'].abs().sort_values(ascending=False)
top_pc2 = loadings['PC2'].abs().sort_values(ascending=False)
print("\nTop habits influencing PC1:")
print(top_pc1.head(5))
print("\nTop habits influencing PC2:")
print(top_pc2.head(5))
# -------------------------------
# 6. PCA Scores: student positions
# -------------------------------
pca_scores = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])
pca_scores['Student'] = df.index  # or use df['student_id'] for the real IDs
print("\nSample PCA scores (student positions on PC1 & PC2):")
print(pca_scores.head())
File loaded successfully!
Feature contributions to each principal component:
PC1 PC2
student_id 0.006059 -0.008229
age -0.001275 -0.005830
study_hours_per_day 0.289535 0.420753
social_media_hours 0.087663 0.314338
netflix_hours 0.069700 0.249439
attendance_percentage 0.004256 0.005197
sleep_hours 0.060277 -0.006677
exercise_frequency 0.059843 -0.015400
mental_health_rating 0.012731 0.002292
previous_gpa 0.513175 -0.029184
semester 0.004031 0.002440
stress_level -0.079213 0.006238
social_activity -0.002960 -0.004759
screen_time 0.288065 0.583309
parental_support_level -0.006723 0.000734
motivation_level 0.383732 -0.399906
exam_anxiety_score -0.377710 0.399598
time_management_score 0.007528 0.003614
exam_score 0.503534 -0.027142
Top habits influencing PC1:
previous_gpa 0.513175
exam_score 0.503534
motivation_level 0.383732
exam_anxiety_score 0.377710
study_hours_per_day 0.289535
Name: PC1, dtype: float64
Top habits influencing PC2:
screen_time 0.583309
study_hours_per_day 0.420753
motivation_level 0.399906
exam_anxiety_score 0.399598
social_media_hours 0.314338
Name: PC2, dtype: float64
Sample PCA scores (student positions on PC1 & PC2):
PC1 PC2 Student
0 1.659639 0.332473 0
1 -0.072835 0.028027 1
2 1.159825 -1.698820 2
3 0.394544 1.435675 3
4 -0.087100 0.748682 4
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# Load data
df = pd.read_csv("datasets/enhanced_student_habits_performance_dataset.csv")
# Select numeric features
numeric_df = df.select_dtypes(include='number')
# Standardize
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)
# PCA (2 components)
pca = PCA(n_components=2)
pc = pca.fit_transform(scaled_data)
# Create PCA dataframe
pca_df = pd.DataFrame(pc, columns=['PC1', 'PC2'])
# Plot
plt.figure(figsize=(10, 7))
plt.scatter(pca_df['PC1'], pca_df['PC2'], s=80, alpha=0.6)
plt.title("Clear PCA Scatter Plot")
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.2f}% variance)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.2f}% variance)")
plt.grid(True)
plt.show()
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# Load dataset
df = pd.read_csv("datasets/enhanced_student_habits_performance_dataset.csv")
# Select numeric data
numeric_df = df.select_dtypes(include=np.number)
# Standardize
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)
# PCA
pca = PCA(n_components=2)
pca.fit(scaled_data)
# Get loadings (feature contributions)
loadings = pd.DataFrame(
    pca.components_.T,
    columns=['PC1', 'PC2'],
    index=numeric_df.columns
)
# Sort by absolute contribution
pc1_sorted = loadings['PC1'].abs().sort_values(ascending=False)
pc2_sorted = loadings['PC2'].abs().sort_values(ascending=False)
# -------------------------------
# Graph 1: PC1 loadings
# -------------------------------
plt.figure(figsize=(10, 6))
pc1_sorted.plot(kind='bar')
plt.title("Top Feature Contributions to PC1")
plt.ylabel("Absolute Loading Value")
plt.xlabel("Features (Habits)")
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
# -------------------------------
# Graph 2: PC2 loadings
# -------------------------------
plt.figure(figsize=(10, 6))
pc2_sorted.plot(kind='bar', color='orange')
plt.title("Top Feature Contributions to PC2")
plt.ylabel("Absolute Loading Value")
plt.xlabel("Features (Habits)")
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
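One caveat on these charts: sorting by absolute value hides the sign of each loading, and the sign matters (for example, exam_anxiety_score loads negatively on PC1 while exam_score loads positively, as the table above shows). A signed variant, assuming loadings from the cell above is in scope:
# Signed PC1 loadings, ordered by magnitude, so direction stays visible
order = loadings['PC1'].abs().sort_values(ascending=False).index
pc1_signed = loadings['PC1'].reindex(order)
plt.figure(figsize=(10, 6))
pc1_signed.plot(kind='bar')
plt.axhline(0, color='grey', lw=1)
plt.title("Signed Feature Contributions to PC1")
plt.ylabel("Loading (signed)")
plt.tight_layout()
plt.show()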
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# -------------------------------
# 1. Load dataset
# -------------------------------
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"
df = pd.read_csv(file_path)
# -------------------------------
# 2. Select numeric columns for PCA
# -------------------------------
numeric_df = df.select_dtypes(include=np.number)
# -------------------------------
# 3. Standardize
# -------------------------------
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)
# -------------------------------
# 4. PCA
# -------------------------------
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)
pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])
# -------------------------------
# 5. Add a "study hours" column
# In this dataset the column is 'study_hours_per_day'; adjust if yours differs
# -------------------------------
if 'study_hours_per_day' in df.columns:
    pca_df['Study_Hours'] = df['study_hours_per_day']
else:
    # Fallback: sum of all numeric features as a rough proxy
    pca_df['Study_Hours'] = numeric_df.sum(axis=1)
# -------------------------------
# 6. Gradient PCA plot
# -------------------------------
plt.figure(figsize=(12,8))
scatter = plt.scatter(
    x=pca_df['PC1'],
    y=pca_df['PC2'],
    c=pca_df['Study_Hours'],  # gradient color = study hours
    cmap='viridis',           # choose your color map
    s=80, alpha=0.7
)
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)")
plt.title("PCA of Students Colored by Study Hours")
plt.axhline(0, color='grey', lw=1)
plt.axvline(0, color='grey', lw=1)
plt.grid(True)
# Colorbar
cbar = plt.colorbar(scatter)
cbar.set_label("Study Hours (Low → High)")
plt.show()
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# -------------------------------
# 1. Load dataset
# -------------------------------
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"
df = pd.read_csv(file_path)
# -------------------------------
# 2. Select numeric columns for PCA
# -------------------------------
numeric_df = df.select_dtypes(include=np.number)
# -------------------------------
# 3. Prepare 10,000 samples
# -------------------------------
if len(numeric_df) < 10000:
    # Upsample by repetition if the dataset is small
    repeats = int(np.ceil(10000 / len(numeric_df)))
    numeric_df = pd.concat([numeric_df] * repeats, ignore_index=True).head(10000)
else:
    # This dataset has 80,000 rows, so this branch runs; reset the index so
    # later positional joins with pca_df line up
    numeric_df = numeric_df.sample(n=10000, random_state=42).reset_index(drop=True)
# -------------------------------
# 4. Standardize
# -------------------------------
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)
# -------------------------------
# 5. PCA
# -------------------------------
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)
pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])
# -------------------------------
# 6. Add a "study hours" column
# In this dataset the column is 'study_hours_per_day'; adjust if yours differs
# -------------------------------
if 'study_hours_per_day' in df.columns:
    # Repeat or sample the column the same way as numeric_df so rows match
    study_hours = df['study_hours_per_day']
    if len(study_hours) < 10000:
        study_hours = pd.concat([study_hours] * repeats, ignore_index=True).head(10000)
    else:
        study_hours = study_hours.sample(n=10000, random_state=42).reset_index(drop=True)
    pca_df['Study_Hours'] = study_hours
else:
    pca_df['Study_Hours'] = numeric_df.sum(axis=1)
# -------------------------------
# 7. Gradient PCA plot (hexbin for clarity)
# -------------------------------
plt.figure(figsize=(12,8))
hb = plt.hexbin(
    x=pca_df['PC1'],
    y=pca_df['PC2'],
    C=pca_df['Study_Hours'],      # hexagon color = mean study hours
    gridsize=60,
    cmap='viridis',
    reduce_C_function=np.mean
)
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)")
plt.title("PCA of 10,000 Students Colored by Study Hours")
plt.axhline(0, color='grey', lw=1)
plt.axvline(0, color='grey', lw=1)
plt.grid(False)
# Colorbar
cbar = plt.colorbar(hb)
cbar.set_label("Study Hours (Low → High)")
plt.tight_layout()
plt.show()
PC1 = overall academic performance and engagement
The loadings computed above show PC1 is dominated by previous_gpa (0.51), exam_score (0.50), and motivation_level (0.38), with exam_anxiety_score (-0.38) pointing the opposite way.
It represents a spectrum from:
➡ Struggling students (low GPA and exam scores, low motivation, high exam anxiety) to ➡ High performers (strong grades, high motivation, low anxiety)
PC2 = hours logged vs motivation
PC2 is driven mainly by:
screen_time (0.58)
study_hours_per_day (0.42)
exam_anxiety_score (0.40) and social_media_hours (0.31), against motivation_level (-0.40)
PC2 separates:
➡ Students who log many hours (screens, social media, and study) with low motivation and high anxiety vs ➡ Students who log fewer hours but report higher motivation
(An earlier ChatGPT guess was that PC1 would be dominated by study_hours, sleep_hours, and screen_time, the "standard pattern" for education datasets; the loading table above shows that performance and motivation dominate instead.)
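A quick numerical check of this reading, assuming numeric_df and principal_components from the cell above are in scope (a PC's overall sign is arbitrary, so all correlations for a PC may flip together between runs):
import numpy as np
pc1 = principal_components[:, 0]
pc2 = principal_components[:, 1]
# Correlate each PC with the candidate drivers named above
for col in ['previous_gpa', 'exam_score', 'motivation_level', 'exam_anxiety_score']:
    print(f"corr(PC1, {col}) = {np.corrcoef(pc1, numeric_df[col])[0, 1]:+.3f}")
for col in ['screen_time', 'study_hours_per_day', 'motivation_level']:
    print(f"corr(PC2, {col}) = {np.corrcoef(pc2, numeric_df[col])[0, 1]:+.3f}")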
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
# -------------------------------
# 1. Load the CSV safely
# -------------------------------
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"
if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    print("File loaded successfully!")
else:
    raise FileNotFoundError(f"The file '{file_path}' does not exist in {os.getcwd()}")
# -------------------------------
# 2. Clean column names: lowercase, replace spaces with underscores
# -------------------------------
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
print("Columns after cleaning:", df.columns)
# -------------------------------
# 3. Select the 3 desired factors
# -------------------------------
numeric_df = df[['study_hours_per_day', 'social_media_hours', 'previous_gpa']]
# -------------------------------
# 4. Sample 1000 rows
# -------------------------------
numeric_df = numeric_df.sample(n=1000, random_state=42)
# -------------------------------
# 5. Standardize
# -------------------------------
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)
# -------------------------------
# 6. PCA
# -------------------------------
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)
pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])
# -------------------------------
# 7. PCA Biplot
# -------------------------------
plt.figure(figsize=(12, 8))
# Scatter plot for 1000 samples
sns.scatterplot(x='PC1', y='PC2', data=pca_df, s=80, color='skyblue', alpha=0.7)
# Feature arrows
for i, feature in enumerate(numeric_df.columns):
    plt.arrow(0, 0,
              pca.components_[0, i] * 5,
              pca.components_[1, i] * 5,
              color='black', alpha=0.8, head_width=0.2)
    plt.text(pca.components_[0, i] * 5.2,
             pca.components_[1, i] * 5.2,
             feature,
             fontsize=12, color='black')
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)")
plt.title("PCA Biplot (1000 Samples: Study, Social Media & Previous GPA)")
plt.axhline(0, color='grey', lw=1)
plt.axvline(0, color='grey', lw=1)
plt.grid(True)
plt.tight_layout()
plt.show()
File loaded successfully!
Columns after cleaning: Index(['student_id', 'age', 'gender', 'major', 'study_hours_per_day',
'social_media_hours', 'netflix_hours', 'part_time_job',
'attendance_percentage', 'sleep_hours', 'diet_quality',
'exercise_frequency', 'parental_education_level', 'internet_quality',
'mental_health_rating', 'extracurricular_participation', 'previous_gpa',
'semester', 'stress_level', 'dropout_risk', 'social_activity',
'screen_time', 'study_environment', 'access_to_tutoring',
'family_income_range', 'parental_support_level', 'motivation_level',
'exam_anxiety_score', 'learning_style', 'time_management_score',
'exam_score'],
dtype='object')