import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
# -------------------------------
# 1. Load the CSV safely
# -------------------------------
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"
if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    print("File loaded successfully!")
else:
    raise FileNotFoundError(f"The file '{file_path}' does not exist in {os.getcwd()}")
# -------------------------------
# 2. Inspect the data
# -------------------------------
print(df.head())
df.info()  # .info() prints directly; wrapping it in print() would add a stray "None"
# -------------------------------
# 3. Select numeric columns only for PCA
# -------------------------------
numeric_df = df.select_dtypes(include=np.number)
# -------------------------------
# 4. Standardize the data
# -------------------------------
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)
# -------------------------------
# 5. Perform PCA (2 components for visualization)
# -------------------------------
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)
# Create a DataFrame with the principal components
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
# -------------------------------
# 6. Plot PCA results
# -------------------------------
plt.figure(figsize=(10, 7))
sns.scatterplot(x='PC1', y='PC2', data=pca_df, s=100)
plt.title('PCA of Student Habits Dataset')
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.2f}% variance)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.2f}% variance)")
plt.grid(True)
plt.show()
# -------------------------------
# 7. Explained variance
# -------------------------------
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Cumulative variance explained:", np.cumsum(pca.explained_variance_ratio_))
File loaded successfully!
   student_id  age  gender             major  study_hours_per_day  \
0      100000   26    Male  Computer Science             7.645367
1      100001   28    Male              Arts             5.700000
2      100002   17    Male              Arts             2.400000
3      100003   27   Other        Psychology             3.400000
4      100004   25  Female          Business             4.700000

   social_media_hours  netflix_hours part_time_job  attendance_percentage  \
0                 3.0            0.1           Yes                   70.3
1                 0.5            0.4            No                   88.4
2                 4.2            0.7            No                   82.1
3                 4.6            2.3           Yes                   79.3
4                 0.8            2.7           Yes                   62.9

   sleep_hours  ...  screen_time  study_environment access_to_tutoring  \
0          6.2  ...         10.9  Co-Learning Group                Yes
1          7.2  ...          8.3  Co-Learning Group                Yes
2          9.2  ...          8.0            Library                Yes
3          4.2  ...         11.7  Co-Learning Group                Yes
4          6.5  ...          9.4         Quiet Room                Yes

  family_income_range  parental_support_level  motivation_level  \
0                High                       9                 7
1                 Low                       7                 2
2                High                       3                 9
3                 Low                       5                 3
4              Medium                       9                 1

   exam_anxiety_score learning_style  time_management_score  exam_score
0                   8        Reading                    3.0         100
1                  10        Reading                    6.0          99
2                   6    Kinesthetic                    7.6          98
3                  10        Reading                    3.2         100
4                  10        Reading                    7.1          98

[5 rows x 31 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 31 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   student_id                     80000 non-null  int64
 1   age                            80000 non-null  int64
 2   gender                         80000 non-null  object
 3   major                          80000 non-null  object
 4   study_hours_per_day            80000 non-null  float64
 5   social_media_hours             80000 non-null  float64
 6   netflix_hours                  80000 non-null  float64
 7   part_time_job                  80000 non-null  object
 8   attendance_percentage          80000 non-null  float64
 9   sleep_hours                    80000 non-null  float64
 10  diet_quality                   80000 non-null  object
 11  exercise_frequency             80000 non-null  int64
 12  parental_education_level       80000 non-null  object
 13  internet_quality               80000 non-null  object
 14  mental_health_rating           80000 non-null  float64
 15  extracurricular_participation  80000 non-null  object
 16  previous_gpa                   80000 non-null  float64
 17  semester                       80000 non-null  int64
 18  stress_level                   80000 non-null  float64
 19  dropout_risk                   80000 non-null  object
 20  social_activity                80000 non-null  int64
 21  screen_time                    80000 non-null  float64
 22  study_environment              80000 non-null  object
 23  access_to_tutoring             80000 non-null  object
 24  family_income_range            80000 non-null  object
 25  parental_support_level         80000 non-null  int64
 26  motivation_level               80000 non-null  int64
 27  exam_anxiety_score             80000 non-null  int64
 28  learning_style                 80000 non-null  object
 29  time_management_score          80000 non-null  float64
 30  exam_score                     80000 non-null  int64
dtypes: float64(10), int64(9), object(12)
memory usage: 18.9+ MB
Explained variance ratio: [0.13650684 0.10294555]
Cumulative variance explained: [0.13650684 0.2394524 ]
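Two components capture only about 24% of the variance here, so the 2-D view is a rough summary. A scree plot shows how much each extra component would add; a minimal sketch, assuming scaled_data from the cell above is still in scope:
# Fit PCA with all components to inspect the full variance spectrum
full_pca = PCA().fit(scaled_data)
plt.figure(figsize=(8, 5))
plt.plot(np.cumsum(full_pca.explained_variance_ratio_), marker='o')
plt.axhline(0.9, color='grey', linestyle='--', lw=1)  # e.g., a 90% target line
plt.xlabel("Number of components")
plt.ylabel("Cumulative explained variance")
plt.title("Scree Plot (cumulative)")
plt.grid(True)
plt.show()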
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
# -------------------------------
# 1. Load the CSV safely
# -------------------------------
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"
if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    print("File loaded successfully!")
else:
    raise FileNotFoundError(f"The file '{file_path}' does not exist in {os.getcwd()}")
# -------------------------------
# 2. Select numeric columns only for PCA
# -------------------------------
numeric_df = df.select_dtypes(include=np.number)
# -------------------------------
# 3. Standardize the data
# -------------------------------
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)
# -------------------------------
# 4. Perform PCA
# -------------------------------
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)
# Create a DataFrame for plotting
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
# -------------------------------
# 5. PCA Biplot
# -------------------------------
plt.figure(figsize=(12, 8))
# Plot students
sns.scatterplot(x='PC1', y='PC2', data=pca_df, s=100, color='lightcoral', alpha=0.7)
# Plot feature vectors (arrows)
for i, feature in enumerate(numeric_df.columns):
    plt.arrow(0, 0,
              pca.components_[0, i] * 5,  # scale arrows for visibility
              pca.components_[1, i] * 5,
              color='black', alpha=0.7, head_width=0.2)
    plt.text(pca.components_[0, i] * 5.2,
             pca.components_[1, i] * 5.2,
             feature, color='black', fontsize=10)
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)")
plt.title("PCA Biplot of Student Habits Dataset")
plt.grid(True)
plt.axhline(0, color='grey', lw=1)
plt.axvline(0, color='grey', lw=1)
plt.show()
File loaded successfully!
My data
I have a dataset where each row = a student and each column = a habit or performance metric (e.g., study hours, sleep hours, attendance, exam score).
Each student has values for all habits.
Some habits may matter more, or vary more across students, than others.
From ChatGPT:
What PCA does here
When you run PCA on this dataset:
Standardization: all habits are rescaled to zero mean and unit variance, so differences in units (hours, scores, percentages) don't dominate.
Find patterns: PCA identifies directions (principal components) that capture the largest differences between students.
Create new variables:
PC1 = the combination of habits that explains the most variation between students.
PC2 = the next combination, orthogonal to PC1, that explains the most variation not captured by PC1.
Each PC is a weighted sum of the original habits, so every habit contributes to every PC; the weights are called loadings (see the sketch below).
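To make that last point concrete, here is a minimal, self-contained sketch on synthetic numbers (not the student dataset) showing that each PC score is exactly the centered data multiplied by the loading weights:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Toy data: 6 "students" x 3 "habits" (synthetic, for illustration only)
rng = np.random.default_rng(0)
X = rng.normal(size=(6, 3))
scaled = StandardScaler().fit_transform(X)
pca = PCA(n_components=2)
scores = pca.fit_transform(scaled)
# Each PC score = centered data @ loading weights (rows of pca.components_)
manual_scores = (scaled - pca.mean_) @ pca.components_.T
print(np.allclose(scores, manual_scores))  # True: PCs are weighted sums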
How to interpret the PCA biplot
Dots = individual students, positioned by their habit profile. Students close together have similar habits/performance.
Arrows = habits/features, showing how strongly each habit contributes to the principal components.
Long arrows = the habit contributes strongly to the variation shown in the plot.
Direction of arrow = the sign of the habit's correlation with each PC.
Example interpretation:
If the "study hours" arrow points mostly along PC1, then variation in study hours is a major driver of PC1.
If the "sleep hours" arrow points in the opposite direction to "study hours", these habits are inversely related in the data.
Students out toward the tip of the "study hours" arrow have high study hours, while those on the opposite side have low study hours (a quick numerical check follows below).
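The arrow reading can be verified numerically: the sign of each habit's correlation with a PC matches the direction of its arrow. A quick check, assuming numeric_df, scaled_data, and principal_components from the biplot cell above are still in scope (a PC's overall sign is arbitrary, so signs can flip between runs):
import numpy as np
# Correlation of each habit with PC1; the sign should match the arrow direction
for i, feature in enumerate(numeric_df.columns):
    r = np.corrcoef(scaled_data[:, i], principal_components[:, 0])[0, 1]
    print(f"{feature:30s} corr with PC1 = {r:+.2f}")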
Why this helps
You can see which habits matter most for distinguishing students.
You can group students with similar habits visually.
It helps identify patterns: e.g., students who study a lot and attend regularly might cluster together (a clustering sketch follows below).
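That grouping idea can be made concrete by clustering the PC scores. A minimal sketch, assuming pca_df from the biplot cell is in scope; the choice of 3 clusters is arbitrary, for illustration only:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
# Cluster students in PC space; k=3 is an illustrative guess, not tuned
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
pca_df['cluster'] = kmeans.fit_predict(pca_df[['PC1', 'PC2']])
plt.figure(figsize=(10, 7))
sns.scatterplot(x='PC1', y='PC2', hue='cluster', data=pca_df, s=60, alpha=0.7, palette='Set2')
plt.title("Students Clustered in PC Space (illustrative)")
plt.show()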
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# -------------------------------
# 1. Load the CSV safely
# -------------------------------
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"
if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    print("File loaded successfully!")
else:
    raise FileNotFoundError(f"The file '{file_path}' does not exist in {os.getcwd()}")
# -------------------------------
# 2. Select numeric columns only for PCA
# -------------------------------
numeric_df = df.select_dtypes(include=np.number)
# -------------------------------
# 3. Standardize the data
# -------------------------------
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)
# -------------------------------
# 4. Perform PCA
# -------------------------------
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)
# -------------------------------
# 5. PCA Results: feature contributions
# -------------------------------
loadings = pd.DataFrame(pca.components_.T,
                        columns=['PC1', 'PC2'],
                        index=numeric_df.columns)
print("\nFeature contributions to each principal component:")
print(loadings)
# Identify top contributing habits for each PC
top_pc1 = loadings['PC1'].abs().sort_values(ascending=False)
top_pc2 = loadings['PC2'].abs().sort_values(ascending=False)
print("\nTop habits influencing PC1:")
print(top_pc1.head(5))
print("\nTop habits influencing PC2:")
print(top_pc2.head(5))
# -------------------------------
# 6. PCA Scores: student positions
# -------------------------------
pca_scores = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])
pca_scores['Student'] = df.index  # or use df['student_id'] for the real IDs
print("\nSample PCA scores (student positions on PC1 & PC2):")
print(pca_scores.head())
File loaded successfully!
Feature contributions to each principal component:
PC1 PC2
student_id 0.006059 -0.008229
age -0.001275 -0.005830
study_hours_per_day 0.289535 0.420753
social_media_hours 0.087663 0.314338
netflix_hours 0.069700 0.249439
attendance_percentage 0.004256 0.005197
sleep_hours 0.060277 -0.006677
exercise_frequency 0.059843 -0.015400
mental_health_rating 0.012731 0.002292
previous_gpa 0.513175 -0.029184
semester 0.004031 0.002440
stress_level -0.079213 0.006238
social_activity -0.002960 -0.004759
screen_time 0.288065 0.583309
parental_support_level -0.006723 0.000734
motivation_level 0.383732 -0.399906
exam_anxiety_score -0.377710 0.399598
time_management_score 0.007528 0.003614
exam_score 0.503534 -0.027142
Top habits influencing PC1:
previous_gpa 0.513175
exam_score 0.503534
motivation_level 0.383732
exam_anxiety_score 0.377710
study_hours_per_day 0.289535
Name: PC1, dtype: float64
Top habits influencing PC2:
screen_time 0.583309
study_hours_per_day 0.420753
motivation_level 0.399906
exam_anxiety_score 0.399598
social_media_hours 0.314338
Name: PC2, dtype: float64
Sample PCA scores (student positions on PC1 & PC2):
PC1 PC2 Student
0 1.659639 0.332473 0
1 -0.072835 0.028027 1
2 1.159825 -1.698820 2
3 0.394544 1.435675 3
4 -0.087100 0.748682 4
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# Load data
df = pd.read_csv("datasets/enhanced_student_habits_performance_dataset.csv")
# Select numeric features
numeric_df = df.select_dtypes(include='number')
# Standardize
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)
# PCA (2 components)
pca = PCA(n_components=2)
pc = pca.fit_transform(scaled_data)
# Create PCA dataframe
pca_df = pd.DataFrame(pc, columns=['PC1', 'PC2'])
# Plot
plt.figure(figsize=(10, 7))
plt.scatter(pca_df['PC1'], pca_df['PC2'], s=80, alpha=0.6)
plt.title("Clear PCA Scatter Plot")
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.2f}% variance)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.2f}% variance)")
plt.grid(True)
plt.show()
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# Load dataset
df = pd.read_csv("datasets/enhanced_student_habits_performance_dataset.csv")
# Select numeric data
numeric_df = df.select_dtypes(include=np.number)
# Standardize
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)
# PCA
pca = PCA(n_components=2)
pca.fit(scaled_data)
# Get loadings (feature contributions)
loadings = pd.DataFrame(
    pca.components_.T,
    columns=['PC1', 'PC2'],
    index=numeric_df.columns
)
# Sort by absolute contribution
pc1_sorted = loadings['PC1'].abs().sort_values(ascending=False)
pc2_sorted = loadings['PC2'].abs().sort_values(ascending=False)
# -------------------------------
# Graph 1: PC1 loadings
# -------------------------------
plt.figure(figsize=(10, 6))
pc1_sorted.plot(kind='bar')
plt.title("Top Feature Contributions to PC1")
plt.ylabel("Absolute Loading Value")
plt.xlabel("Features (Habits)")
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
# -------------------------------
# Graph 2: PC2 loadings
# -------------------------------
plt.figure(figsize=(10, 6))
pc2_sorted.plot(kind='bar', color='orange')
plt.title("Top Feature Contributions to PC2")
plt.ylabel("Absolute Loading Value")
plt.xlabel("Features (Habits)")
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
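One caveat on these charts: sorting by absolute value hides the sign of each loading, and the sign matters (for example, exam_anxiety_score loads negatively on PC1 while exam_score loads positively, as the table above shows). A signed variant, assuming loadings from the cell above is in scope:
# Signed PC1 loadings, ordered by magnitude, so direction stays visible
order = loadings['PC1'].abs().sort_values(ascending=False).index
pc1_signed = loadings['PC1'].reindex(order)
plt.figure(figsize=(10, 6))
pc1_signed.plot(kind='bar')
plt.axhline(0, color='grey', lw=1)
plt.title("Signed Feature Contributions to PC1")
plt.ylabel("Loading (signed)")
plt.tight_layout()
plt.show()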
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# -------------------------------
# 1. Load dataset
# -------------------------------
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"
df = pd.read_csv(file_path)
# -------------------------------
# 2. Select numeric columns for PCA
# -------------------------------
numeric_df = df.select_dtypes(include=np.number)
# -------------------------------
# 3. Standardize
# -------------------------------
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)
# -------------------------------
# 4. PCA
# -------------------------------
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)
pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])
# -------------------------------
# 5. Add a "study hours" column
# In this dataset the column is 'study_hours_per_day'; adjust if yours differs
# -------------------------------
if 'study_hours_per_day' in df.columns:
    pca_df['Study_Hours'] = df['study_hours_per_day']
else:
    # Fallback: sum of all numeric features as a rough proxy
    pca_df['Study_Hours'] = numeric_df.sum(axis=1)
# -------------------------------
# 6. Gradient PCA plot
# -------------------------------
plt.figure(figsize=(12,8))
scatter = plt.scatter(
    x=pca_df['PC1'],
    y=pca_df['PC2'],
    c=pca_df['Study_Hours'],  # gradient color = study hours
    cmap='viridis',           # choose your color map
    s=80, alpha=0.7
)
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)")
plt.title("PCA of Students Colored by Study Hours")
plt.axhline(0, color='grey', lw=1)
plt.axvline(0, color='grey', lw=1)
plt.grid(True)
# Colorbar
cbar = plt.colorbar(scatter)
cbar.set_label("Study Hours (Low → High)")
plt.show()
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# -------------------------------
# 1. Load dataset
# -------------------------------
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"
df = pd.read_csv(file_path)
# -------------------------------
# 2. Select numeric columns for PCA
# -------------------------------
numeric_df = df.select_dtypes(include=np.number)
# -------------------------------
# 3. Prepare 10,000 samples
# -------------------------------
if len(numeric_df) < 10000:
    # Upsample by repetition if the dataset is small
    repeats = int(np.ceil(10000 / len(numeric_df)))
    numeric_df = pd.concat([numeric_df] * repeats, ignore_index=True).head(10000)
else:
    # This dataset has 80,000 rows, so this branch runs; reset the index so
    # later positional joins with pca_df line up
    numeric_df = numeric_df.sample(n=10000, random_state=42).reset_index(drop=True)
# -------------------------------
# 4. Standardize
# -------------------------------
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)
# -------------------------------
# 5. PCA
# -------------------------------
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)
pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])
# -------------------------------
# 6. Add a "study hours" column
# In this dataset the column is 'study_hours_per_day'; adjust if yours differs
# -------------------------------
if 'study_hours_per_day' in df.columns:
    # Repeat or sample the column the same way as numeric_df so rows match
    study_hours = df['study_hours_per_day']
    if len(study_hours) < 10000:
        study_hours = pd.concat([study_hours] * repeats, ignore_index=True).head(10000)
    else:
        study_hours = study_hours.sample(n=10000, random_state=42).reset_index(drop=True)
    pca_df['Study_Hours'] = study_hours
else:
    pca_df['Study_Hours'] = numeric_df.sum(axis=1)
# -------------------------------
# 7. Gradient PCA plot (hexbin for clarity)
# -------------------------------
plt.figure(figsize=(12,8))
hb = plt.hexbin(
    x=pca_df['PC1'],
    y=pca_df['PC2'],
    C=pca_df['Study_Hours'],      # hexagon color = mean study hours
    gridsize=60,
    cmap='viridis',
    reduce_C_function=np.mean
)
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)")
plt.title("PCA of 10,000 Students Colored by Study Hours")
plt.axhline(0, color='grey', lw=1)
plt.axvline(0, color='grey', lw=1)
plt.grid(False)
# Colorbar
cbar = plt.colorbar(hb)
cbar.set_label("Study Hours (Low → High)")
plt.tight_layout()
plt.show()
PC1 = overall academic performance and engagement
The loadings computed above show PC1 is dominated by previous_gpa (0.51), exam_score (0.50), and motivation_level (0.38), with exam_anxiety_score (-0.38) pointing the opposite way.
It represents a spectrum from:
➡ Struggling students (low GPA and exam scores, low motivation, high exam anxiety) to ➡ High performers (strong grades, high motivation, low anxiety)
PC2 = hours logged vs motivation
PC2 is driven mainly by:
screen_time (0.58)
study_hours_per_day (0.42)
exam_anxiety_score (0.40) and social_media_hours (0.31), against motivation_level (-0.40)
PC2 separates:
➡ Students who log many hours (screens, social media, and study) with low motivation and high anxiety vs ➡ Students who log fewer hours but report higher motivation
(An earlier ChatGPT guess was that PC1 would be dominated by study_hours, sleep_hours, and screen_time, the "standard pattern" for education datasets; the loading table above shows that performance and motivation dominate instead.)
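A quick numerical check of this reading, assuming numeric_df and principal_components from the cell above are in scope (a PC's overall sign is arbitrary, so all correlations for a PC may flip together between runs):
import numpy as np
pc1 = principal_components[:, 0]
pc2 = principal_components[:, 1]
# Correlate each PC with the candidate drivers named above
for col in ['previous_gpa', 'exam_score', 'motivation_level', 'exam_anxiety_score']:
    print(f"corr(PC1, {col}) = {np.corrcoef(pc1, numeric_df[col])[0, 1]:+.3f}")
for col in ['screen_time', 'study_hours_per_day', 'motivation_level']:
    print(f"corr(PC2, {col}) = {np.corrcoef(pc2, numeric_df[col])[0, 1]:+.3f}")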
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
# -------------------------------
# 1. Load the CSV safely
# -------------------------------
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"
if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    print("File loaded successfully!")
else:
    raise FileNotFoundError(f"The file '{file_path}' does not exist in {os.getcwd()}")
# -------------------------------
# 2. Clean column names: lowercase, replace spaces with underscores
# -------------------------------
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
print("Columns after cleaning:", df.columns)
# -------------------------------
# 3. Select the 3 desired factors
# -------------------------------
numeric_df = df[['study_hours_per_day', 'social_media_hours', 'previous_gpa']]
# -------------------------------
# 4. Sample 1000 rows
# -------------------------------
numeric_df = numeric_df.sample(n=1000, random_state=42)
# -------------------------------
# 5. Standardize
# -------------------------------
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)
# -------------------------------
# 6. PCA
# -------------------------------
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)
pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])
# -------------------------------
# 7. PCA Biplot
# -------------------------------
plt.figure(figsize=(12, 8))
# Scatter plot for 1000 samples
sns.scatterplot(x='PC1', y='PC2', data=pca_df, s=80, color='skyblue', alpha=0.7)
# Feature arrows
for i, feature in enumerate(numeric_df.columns):
    plt.arrow(0, 0,
              pca.components_[0, i] * 5,
              pca.components_[1, i] * 5,
              color='black', alpha=0.8, head_width=0.2)
    plt.text(pca.components_[0, i] * 5.2,
             pca.components_[1, i] * 5.2,
             feature,
             fontsize=12, color='black')
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)")
plt.title("PCA Biplot (1000 Samples: Study, Social Media & Previous GPA)")
plt.axhline(0, color='grey', lw=1)
plt.axvline(0, color='grey', lw=1)
plt.grid(True)
plt.tight_layout()
plt.show()
File loaded successfully!
Columns after cleaning: Index(['student_id', 'age', 'gender', 'major', 'study_hours_per_day',
'social_media_hours', 'netflix_hours', 'part_time_job',
'attendance_percentage', 'sleep_hours', 'diet_quality',
'exercise_frequency', 'parental_education_level', 'internet_quality',
'mental_health_rating', 'extracurricular_participation', 'previous_gpa',
'semester', 'stress_level', 'dropout_risk', 'social_activity',
'screen_time', 'study_environment', 'access_to_tutoring',
'family_income_range', 'parental_support_level', 'motivation_level',
'exam_anxiety_score', 'learning_style', 'time_management_score',
'exam_score'],
dtype='object')