import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Read the StudentsPerformance CSV and keep just the numeric columns,
# since scaling and PCA below operate on numeric input only.
df = pd.read_csv('datasets/StudentsPerformance.csv')
numeric_df = df.select_dtypes(include=['number'])

# Standardize each numeric column. The scaler is fitted first and then
# applied, so the fitted `scaler` object stays available for reuse.
scaler = StandardScaler().fit(numeric_df)
scaled_data = scaler.transform(numeric_df)

# Reduce the standardized data to its first two principal components;
# `pca_result` has one row per input row and one column per component.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)

# Label the two component score columns and collect them in a DataFrame.
# Transposing lets each column of `pca_result` be paired with its name.
pca_df = pd.DataFrame(dict(zip(('PC1', 'PC2'), pca_result.T)))

# Preview the first five rows of component scores.
print(pca_df.head(5))

# Scatter the first principal component against the second on a single
# 6x5-inch figure, using the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(pca_df['PC1'], pca_df['PC2'])
ax.set(xlabel='PC1', ylabel='PC2', title='PCA Scatter Plot')
plt.show()

        PC1       PC2
0  0.560514  0.088285
1  1.719201 -0.910745
2  2.883135 -0.021999
3 -2.119921 -0.074994
4  0.988094  0.131914

Principal Component Analysis (PCA)