import numpy as np
import matplotlib.pyplot as plt
import sklearn
import pandas as pd

# Import the PCA class from its submodule explicitly: a bare `import sklearn`
# is not guaranteed to make `sklearn.decomposition` available as an attribute
# on every scikit-learn version.
from sklearn.decomposition import PCA

# Load the Students Performance dataset into a DataFrame.
data = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")

print("Columns available in your dataset:\n", data.columns)

# Choose FEATURES and LABEL.
# Label    = "parental level of education", encoded as integer category codes
#            (it is only used to colour the scatter plots below).
# Features = the three exam-score columns.
y = data["parental level of education"].astype('category').cat.codes
X = data[["math score", "reading score", "writing score"]].values

print(f"Your data shape (records, features): {X.shape}")

# Scatter plot of the first two raw features, coloured by the label code.
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.xlabel("math score")
plt.ylabel("reading score")
plt.title("Students Performance: Two Features")
plt.colorbar(label="parental level of education")
plt.show()

# Standardize: centre every feature column, then divide by its standard
# deviation. NOTE: `X` is rebound to the centred data from here on.
X = X - np.mean(X, axis=0)
std = np.std(X, axis=0)
# A zero standard deviation (constant column) is replaced by 1 so the
# division leaves that column unchanged instead of dividing by zero.
Xscale = X / np.where(std > 0, std, 1)

# PCA with 3 components (only 3 features exist, so nothing is discarded).
pca = PCA(n_components=3)
Xpca = pca.fit_transform(Xscale)

# Explained variance of each principal component, in component order.
plt.plot(pca.explained_variance_, 'o')
plt.xlabel("component")
plt.ylabel("explained variance")
plt.show()

# Scatter plot of the data projected onto the first two principal components,
# coloured by the same label codes as the raw-feature plot.
plt.scatter(Xpca[:, 0], Xpca[:, 1], c=y)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("PCA of Student Performance")
plt.colorbar(label="parental level of education")
plt.show()

Columns available in your dataset:
 Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')
Your data shape (records, features): (1000, 3)

Transform

Principal Components Analysis (PCA)