[Sonam Zam Rinzin] - Fab Futures - Data Science
Home About
In [5]:
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# -------------------------------
# 1. Load the CSV safely
# -------------------------------
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"

if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    print("File loaded successfully!")
else:
    raise FileNotFoundError(f"The file '{file_path}' does not exist in {os.getcwd()}")

# -------------------------------
# 2. Inspect the data
# -------------------------------
print(df.head())
print(df.info())

# -------------------------------
# 3. Select numeric columns only for PCA
# -------------------------------
numeric_df = df.select_dtypes(include=np.number)

# -------------------------------
# 4. Standardize the data
# -------------------------------
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)

# -------------------------------
# 5. Perform PCA (2 components for visualization)
# -------------------------------
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)

# Create a DataFrame with the principal components
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])

# -------------------------------
# 6. Plot PCA results
# -------------------------------
plt.figure(figsize=(10, 7))
sns.scatterplot(x='PC1', y='PC2', data=pca_df, s=100)
plt.title('PCA of Student Habits Dataset')
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.2f}% variance)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.2f}% variance)")
plt.grid(True)
plt.show()

# -------------------------------
# 7. Explained variance
# -------------------------------
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Cumulative variance explained:", np.cumsum(pca.explained_variance_ratio_))
File loaded successfully!
   student_id  age  gender             major  study_hours_per_day  \
0      100000   26    Male  Computer Science             7.645367   
1      100001   28    Male              Arts             5.700000   
2      100002   17    Male              Arts             2.400000   
3      100003   27   Other        Psychology             3.400000   
4      100004   25  Female          Business             4.700000   

   social_media_hours  netflix_hours part_time_job  attendance_percentage  \
0                 3.0            0.1           Yes                   70.3   
1                 0.5            0.4            No                   88.4   
2                 4.2            0.7            No                   82.1   
3                 4.6            2.3           Yes                   79.3   
4                 0.8            2.7           Yes                   62.9   

   sleep_hours  ... screen_time  study_environment access_to_tutoring  \
0          6.2  ...        10.9  Co-Learning Group                Yes   
1          7.2  ...         8.3  Co-Learning Group                Yes   
2          9.2  ...         8.0            Library                Yes   
3          4.2  ...        11.7  Co-Learning Group                Yes   
4          6.5  ...         9.4         Quiet Room                Yes   

  family_income_range  parental_support_level motivation_level  \
0                High                       9                7   
1                 Low                       7                2   
2                High                       3                9   
3                 Low                       5                3   
4              Medium                       9                1   

   exam_anxiety_score  learning_style  time_management_score exam_score  
0                   8         Reading                    3.0        100  
1                  10         Reading                    6.0         99  
2                   6     Kinesthetic                    7.6         98  
3                  10         Reading                    3.2        100  
4                  10         Reading                    7.1         98  

[5 rows x 31 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 31 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   student_id                     80000 non-null  int64  
 1   age                            80000 non-null  int64  
 2   gender                         80000 non-null  object 
 3   major                          80000 non-null  object 
 4   study_hours_per_day            80000 non-null  float64
 5   social_media_hours             80000 non-null  float64
 6   netflix_hours                  80000 non-null  float64
 7   part_time_job                  80000 non-null  object 
 8   attendance_percentage          80000 non-null  float64
 9   sleep_hours                    80000 non-null  float64
 10  diet_quality                   80000 non-null  object 
 11  exercise_frequency             80000 non-null  int64  
 12  parental_education_level       80000 non-null  object 
 13  internet_quality               80000 non-null  object 
 14  mental_health_rating           80000 non-null  float64
 15  extracurricular_participation  80000 non-null  object 
 16  previous_gpa                   80000 non-null  float64
 17  semester                       80000 non-null  int64  
 18  stress_level                   80000 non-null  float64
 19  dropout_risk                   80000 non-null  object 
 20  social_activity                80000 non-null  int64  
 21  screen_time                    80000 non-null  float64
 22  study_environment              80000 non-null  object 
 23  access_to_tutoring             80000 non-null  object 
 24  family_income_range            80000 non-null  object 
 25  parental_support_level         80000 non-null  int64  
 26  motivation_level               80000 non-null  int64  
 27  exam_anxiety_score             80000 non-null  int64  
 28  learning_style                 80000 non-null  object 
 29  time_management_score          80000 non-null  float64
 30  exam_score                     80000 non-null  int64  
dtypes: float64(10), int64(9), object(12)
memory usage: 18.9+ MB
None
No description has been provided for this image
Explained variance ratio: [0.13650684 0.10294555]
Cumulative variance explained: [0.13650684 0.2394524 ]
In [32]:
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# -------------------------------
# 1. Load the CSV safely
# -------------------------------
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"

if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    print("File loaded successfully!")
else:
    raise FileNotFoundError(f"The file '{file_path}' does not exist in {os.getcwd()}")

# -------------------------------
# 2. Select numeric columns only for PCA
# -------------------------------
numeric_df = df.select_dtypes(include=np.number)

# -------------------------------
# 3. Standardize the data
# -------------------------------
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)

# -------------------------------
# 4. Perform PCA
# -------------------------------
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)

# Create a DataFrame for plotting
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])

# -------------------------------
# 5. PCA Biplot
# -------------------------------
plt.figure(figsize=(12, 8))

# Plot students
sns.scatterplot(x='PC1', y='PC2', data=pca_df, s=100, color='lightcoral', alpha=0.7)

# Plot feature vectors (arrows)
for i, feature in enumerate(numeric_df.columns):
    plt.arrow(0, 0,
              pca.components_[0, i]*5,  # scale arrows for visibility
              pca.components_[1, i]*5,
              color='black', alpha=0.7, head_width=0.2)
    plt.text(pca.components_[0, i]*5.2,
             pca.components_[1, i]*5.2,
             feature, color='black', fontsize=10)

plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)")
plt.title("PCA Biplot of Student Habits Dataset")
plt.grid(True)
plt.axhline(0, color='grey', lw=1)
plt.axvline(0, color='grey', lw=1)
plt.show()
File loaded successfully!
No description has been provided for this image

My data¶

I have a dataset where each row = a student, and each column = a habit or performance metric (e.g., study hours, sleep quality, participation, assignments completed).

Each student has values for all habits.

Some habits might be more important or vary more across students.

From ChatGPT:

What PCA does here¶

When you run PCA on this dataset:

Standardization: All habits are scaled so that differences in units (hours, scores, percentages) don’t dominate.

Find patterns: PCA identifies directions (principal components) that capture the largest differences between students.

Create new variables:

PC1 = the combination of habits that explains the most variation between students.

PC2 = the next combination that explains variation not captured by PC1.

These PCs are weighted sums of original habits, so each habit contributes to the PCs.

How to interpret the PCA biplot¶

Blue dots = individual students, positioned based on habits. Students close together have similar habits/performance.

Red arrows = habits/features, showing how strongly each habit influences the principal components.

Long arrows = habit strongly affects variation.

Direction of arrow = correlation with PCs.

Example interpretation:

If “study hours” arrow points mostly along PC1, then students’ variation in study hours is a major factor for PC1.

If “sleep quality” arrow points in the opposite direction to “study hours”, these habits are inversely related in the data.

Students near the tip of “study hours” arrow have high study hours, while those on the opposite side have low study hours.

Why this helps¶

You can see which habits matter most for distinguishing students.

You can group students with similar habits visually.

Helps identify patterns: e.g., students who study a lot and participate actively might cluster together.

In [8]:
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# -------------------------------
# 1. Load the CSV safely
# -------------------------------
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"

if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    print("File loaded successfully!")
else:
    raise FileNotFoundError(f"The file '{file_path}' does not exist in {os.getcwd()}")

# -------------------------------
# 2. Select numeric columns only for PCA
# -------------------------------
numeric_df = df.select_dtypes(include=np.number)

# -------------------------------
# 3. Standardize the data
# -------------------------------
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)

# -------------------------------
# 4. Perform PCA
# -------------------------------
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)

# -------------------------------
# 5. PCA Results: feature contributions
# -------------------------------
loadings = pd.DataFrame(pca.components_.T, 
                        columns=['PC1', 'PC2'], 
                        index=numeric_df.columns)
print("\nFeature contributions to each principal component:")
print(loadings)

# Identify top contributing habits for each PC
top_pc1 = loadings['PC1'].abs().sort_values(ascending=False)
top_pc2 = loadings['PC2'].abs().sort_values(ascending=False)

print("\nTop habits influencing PC1:")
print(top_pc1.head(5))
print("\nTop habits influencing PC2:")
print(top_pc2.head(5))

# -------------------------------
# 6. PCA Scores: student positions
# -------------------------------
pca_scores = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])
pca_scores['Student'] = df.index  # or a column with student names if available

print("\nSample PCA scores (student positions on PC1 & PC2):")
print(pca_scores.head())
File loaded successfully!

Feature contributions to each principal component:
                             PC1       PC2
student_id              0.006059 -0.008229
age                    -0.001275 -0.005830
study_hours_per_day     0.289535  0.420753
social_media_hours      0.087663  0.314338
netflix_hours           0.069700  0.249439
attendance_percentage   0.004256  0.005197
sleep_hours             0.060277 -0.006677
exercise_frequency      0.059843 -0.015400
mental_health_rating    0.012731  0.002292
previous_gpa            0.513175 -0.029184
semester                0.004031  0.002440
stress_level           -0.079213  0.006238
social_activity        -0.002960 -0.004759
screen_time             0.288065  0.583309
parental_support_level -0.006723  0.000734
motivation_level        0.383732 -0.399906
exam_anxiety_score     -0.377710  0.399598
time_management_score   0.007528  0.003614
exam_score              0.503534 -0.027142

Top habits influencing PC1:
previous_gpa           0.513175
exam_score             0.503534
motivation_level       0.383732
exam_anxiety_score     0.377710
study_hours_per_day    0.289535
Name: PC1, dtype: float64

Top habits influencing PC2:
screen_time            0.583309
study_hours_per_day    0.420753
motivation_level       0.399906
exam_anxiety_score     0.399598
social_media_hours     0.314338
Name: PC2, dtype: float64

Sample PCA scores (student positions on PC1 & PC2):
        PC1       PC2  Student
0  1.659639  0.332473        0
1 -0.072835  0.028027        1
2  1.159825 -1.698820        2
3  0.394544  1.435675        3
4 -0.087100  0.748682        4
In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Load data
df = pd.read_csv("datasets/enhanced_student_habits_performance_dataset.csv")

# Select numeric features
numeric_df = df.select_dtypes(include='number')

# Standardize
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)

# PCA (2 components)
pca = PCA(n_components=2)
pc = pca.fit_transform(scaled_data)

# Create PCA dataframe
pca_df = pd.DataFrame(pc, columns=['PC1', 'PC2'])

# Plot
plt.figure(figsize=(10, 7))
plt.scatter(pca_df['PC1'], pca_df['PC2'], s=80, alpha=0.6)

plt.title("Clear PCA Scatter Plot")
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.2f}% variance)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.2f}% variance)")
plt.grid(True)
plt.show()
No description has been provided for this image
In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Load dataset
df = pd.read_csv("datasets/enhanced_student_habits_performance_dataset.csv")

# Select numeric data
numeric_df = df.select_dtypes(include=np.number)

# Standardize
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)

# PCA
pca = PCA(n_components=2)
pca.fit(scaled_data)

# Get loadings (feature contributions)
loadings = pd.DataFrame(
    pca.components_.T,
    columns=['PC1', 'PC2'],
    index=numeric_df.columns
)

# Sort by absolute contribution
pc1_sorted = loadings['PC1'].abs().sort_values(ascending=False)
pc2_sorted = loadings['PC2'].abs().sort_values(ascending=False)

# -------------------------------
# Graph 1: PC1 loadings
# -------------------------------
plt.figure(figsize=(10, 6))
pc1_sorted.plot(kind='bar')
plt.title("Top Feature Contributions to PC1")
plt.ylabel("Absolute Loading Value")
plt.xlabel("Features (Habits)")
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

# -------------------------------
# Graph 2: PC2 loadings
# -------------------------------
plt.figure(figsize=(10, 6))
pc2_sorted.plot(kind='bar', color='orange')
plt.title("Top Feature Contributions to PC2")
plt.ylabel("Absolute Loading Value")
plt.xlabel("Features (Habits)")
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
No description has been provided for this image
No description has been provided for this image
In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# -------------------------------
# 1. Load dataset
# -------------------------------
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"
df = pd.read_csv(file_path)

# -------------------------------
# 2. Select numeric columns for PCA
# -------------------------------
numeric_df = df.select_dtypes(include=np.number)

# -------------------------------
# 3. Standardize
# -------------------------------
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)

# -------------------------------
# 4. PCA
# -------------------------------
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)

pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])

# -------------------------------
# 5. Add a "study hours" column
# Replace 'Study_Hours' with your actual column name
# -------------------------------
if 'Study_Hours' in df.columns:
    pca_df['Study_Hours'] = df['Study_Hours']
else:
    # Example: sum of all numeric features as a proxy
    pca_df['Study_Hours'] = numeric_df.sum(axis=1)

# -------------------------------
# 6. Gradient PCA plot
# -------------------------------
plt.figure(figsize=(12,8))
scatter = plt.scatter(
    x=pca_df['PC1'], 
    y=pca_df['PC2'], 
    c=pca_df['Study_Hours'],      # gradient color
    cmap='viridis',                # choose your color map
    s=80, alpha=0.7
)
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)")
plt.title("PCA of Students Colored by Study Hours")
plt.axhline(0, color='grey', lw=1)
plt.axvline(0, color='grey', lw=1)
plt.grid(True)

# Colorbar
cbar = plt.colorbar(scatter)
cbar.set_label("Study Hours (Low → High)")

plt.show()
No description has been provided for this image
In [30]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# -------------------------------
# 1. Load dataset
# -------------------------------
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"
df = pd.read_csv(file_path)

# -------------------------------
# 2. Select numeric columns for PCA
# -------------------------------
numeric_df = df.select_dtypes(include=np.number)

# -------------------------------
# 3. Prepare 10,000 samples
# -------------------------------
if len(numeric_df) < 10000:
    repeats = int(np.ceil(10000 / len(numeric_df)))
    numeric_df = pd.concat([numeric_df] * repeats, ignore_index=True).head(10000)
else:
    numeric_df = numeric_df.sample(n=10000, random_state=42)

# -------------------------------
# 4. Standardize
# -------------------------------
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)

# -------------------------------
# 5. PCA
# -------------------------------
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)

pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])

# -------------------------------
# 6. Add a "study hours" column
# Replace 'Study_Hours' with your actual column name
# -------------------------------
if 'Study_Hours' in df.columns:
    # Repeat or sample the Study_Hours column to match 10k rows
    study_hours = df['Study_Hours']
    if len(study_hours) < 10000:
        study_hours = pd.concat([study_hours] * repeats, ignore_index=True).head(10000)
    else:
        study_hours = study_hours.sample(n=10000, random_state=42).reset_index(drop=True)
    pca_df['Study_Hours'] = study_hours
else:
    pca_df['Study_Hours'] = numeric_df.sum(axis=1)

# -------------------------------
# 7. Gradient PCA plot (hexbin for clarity)
# -------------------------------
plt.figure(figsize=(12,8))
hb = plt.hexbin(
    x=pca_df['PC1'], 
    y=pca_df['PC2'], 
    C=pca_df['Study_Hours'],   # gradient
    gridsize=60, 
    cmap='viridis', 
    reduce_C_function=np.mean
)
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)")
plt.title("PCA of 10,000 Students Colored by Study Hours")
plt.axhline(0, color='grey', lw=1)
plt.axvline(0, color='grey', lw=1)
plt.grid(False)

# Colorbar
cbar = plt.colorbar(hb)
cbar.set_label("Study Hours (Low → High)")

plt.tight_layout()
plt.show()
No description has been provided for this image

PC1 = overall student lifestyle intensity

PC1 is likely dominated by study_hours, sleep_hours, and screen_time, because these have the highest variance.

It represents a spectrum from:

➡ Low engagement lifestyle (low study, irregular sleep, high screen time) to ➡ High engagement lifestyle (good study habits, consistent sleep, balanced activities)

PC2 = performance vs habits balance

PC2 is usually driven by:

exam_score

assignment_completion

consistency metrics

PC2 often shows:

➡ Students who have good habits but average performance vs ➡ Students who have high performance regardless of habits

We would need your actual loading values to confirm—but this is the standard pattern for education datasets.

In [28]:
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# -------------------------------
# 1. Load the CSV safely
# -------------------------------
file_path = "datasets/enhanced_student_habits_performance_dataset.csv"

if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    print("File loaded successfully!")
else:
    raise FileNotFoundError(f"The file '{file_path}' does not exist in {os.getcwd()}")

# -------------------------------
# 2. Clean column names: lowercase, replace spaces with underscores
# -------------------------------
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
print("Columns after cleaning:", df.columns)

# -------------------------------
# 3. Select the 3 desired factors
# -------------------------------
numeric_df = df[['study_hours_per_day', 'social_media_hours', 'previous_gpa']]

# -------------------------------
# 4. Sample 1000 rows
# -------------------------------
numeric_df = numeric_df.sample(n=1000, random_state=42)

# -------------------------------
# 5. Standardize
# -------------------------------
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_df)

# -------------------------------
# 6. PCA
# -------------------------------
pca = PCA(n_components=2)
principal_components = pca.fit_transform(scaled_data)

pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])

# -------------------------------
# 7. PCA Biplot
# -------------------------------
plt.figure(figsize=(12, 8))

# Scatter plot for 1000 samples
sns.scatterplot(x='PC1', y='PC2', data=pca_df, s=80, color='skyblue', alpha=0.7)

# Feature arrows
for i, feature in enumerate(['study_hours_per_day', 'social_media_hours', 'previous_gpa']):
    plt.arrow(0, 0,
              pca.components_[0, i]*5,
              pca.components_[1, i]*5,
              color='black', alpha=0.8, head_width=0.2)
    plt.text(pca.components_[0, i]*5.2,
             pca.components_[1, i]*5.2,
             feature,
             fontsize=12, color='black')

plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)")
plt.title("PCA Biplot (1000 Samples: Study, Social Media & Previous GPA)")
plt.axhline(0, color='grey', lw=1)
plt.axvline(0, color='grey', lw=1)
plt.grid(True)
plt.tight_layout()
plt.show()
File loaded successfully!
Columns after cleaning: Index(['student_id', 'age', 'gender', 'major', 'study_hours_per_day',
       'social_media_hours', 'netflix_hours', 'part_time_job',
       'attendance_percentage', 'sleep_hours', 'diet_quality',
       'exercise_frequency', 'parental_education_level', 'internet_quality',
       'mental_health_rating', 'extracurricular_participation', 'previous_gpa',
       'semester', 'stress_level', 'dropout_risk', 'social_activity',
       'screen_time', 'study_environment', 'access_to_tutoring',
       'family_income_range', 'parental_support_level', 'motivation_level',
       'exam_anxiety_score', 'learning_style', 'time_management_score',
       'exam_score'],
      dtype='object')
No description has been provided for this image
In [ ]: