[Your-Name-Here] - Fab Futures - Data Science
Home About

Principal Components Analysis (PCA)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

np.set_printoptions(precision=1)

#
# Load mental health dataset
#
df = pd.read_csv("../kinley-dema/datasets/mental_health.csv")

print(f"Data shape (records, features): {df.shape}")

# Select numeric columns
num_cols = df.select_dtypes(include='number').columns
X = df[num_cols].to_numpy()
print(f"Numeric data shape: {X.shape}")

#
# Plot two numeric columns
#
plt.scatter(X[:,0], X[:,1])
plt.xlabel(num_cols[0])
plt.ylabel(num_cols[1])
plt.title("Dataset: Scatter of first two numeric features")
plt.show()

#
# Standardize data
#
print(f"Data mean: {np.mean(X):.2f}, variance: {np.var(X):.2f}")

X_centered = X - np.mean(X, axis=0)
std = np.std(X_centered, axis=0)
Xscale = X_centered / np.where(std > 0, std, 1)

print(f"Standardized mean: {np.mean(Xscale):.2f}, variance: {np.var(Xscale):.2f}")

#
# PCA
#
pca = sklearn.decomposition.PCA(n_components=5)
pca.fit(Xscale)
Xpca = pca.transform(Xscale)

plt.plot(pca.explained_variance_, 'o')
plt.xlabel("PCA component")
plt.ylabel("Explained variance")
plt.title("PCA on Mental Health Dataset")
plt.show()

#
# Plot PCA components
#
plt.scatter(Xpca[:,0], Xpca[:,1])
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("First Two PCA Components")
plt.show()
Data shape (records, features): (500, 10)
Numeric data shape: (500, 7)
No description has been provided for this image
Data mean: 9.34, variance: 113.12
Standardized mean: 0.00, variance: 1.00
No description has been provided for this image
No description has been provided for this image

Filters

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import butter, filtfilt

# Select first numeric column to use as "signal"
num_cols = df.select_dtypes(include='number').columns
signal = df[num_cols[0]].to_numpy()

print(f"Using column: {num_cols[0]}")
print(f"Signal length: {len(signal)}")

#
# Plot raw time series
#
plt.plot(signal)
plt.title("Raw Time Series (from dataset)")
plt.xlabel("Index")
plt.ylabel(num_cols[0])
plt.show()

#
# FFT of raw signal
#
rate = 20000  # sampling rate assumption (same as your original code)
fft = np.fft.rfft(signal)
frequencies = np.fft.rfftfreq(len(signal), d=1/rate)

plt.plot(frequencies, np.abs(fft))
plt.title("Raw Time Series FFT")
plt.xlabel("frequency (Hz)")
plt.ylabel("magnitude")
plt.xscale('log')
plt.yscale('log')
plt.xlim(10, 5000)
plt.show()

#
# Filtering parameters
#
order = 5

# ----------------------------- LOW PASS -----------------------------
low = 500
b, a = butter(order, low, btype='low', fs=rate)
low_filtered = filtfilt(b, a, signal)

plt.plot(low_filtered)
plt.title("Low-pass Filtered Signal")
plt.xlabel("Index")
plt.ylabel(num_cols[0])
plt.show()

fft_low = np.fft.rfft(low_filtered)
plt.plot(frequencies, np.abs(fft_low))
plt.title("Low-pass Filter FFT")
plt.xlabel("frequency (Hz)")
plt.ylabel("magnitude")
plt.xscale('log')
plt.yscale('log')
plt.xlim(10, 5000)
plt.show()

# ----------------------------- HIGH PASS -----------------------------
high = 500
b, a = butter(order, high, btype='high', fs=rate)
high_filtered = filtfilt(b, a, signal)

plt.plot(high_filtered)
plt.title("High-pass Filtered Signal")
plt.xlabel("Index")
plt.ylabel(num_cols[0])
plt.show()

fft_high = np.fft.rfft(high_filtered)
plt.plot(frequencies, np.abs(fft_high))
plt.title("High-pass Filter FFT")
plt.xlabel("frequency (Hz)")
plt.ylabel("magnitude")
plt.xscale('log')
plt.yscale('log')
plt.xlim(10, 5000)
plt.show()

# ----------------------------- BAND PASS -----------------------------
low = 40
high = 750
b, a = butter(order, [low, high], btype='band', fs=rate)
band_filtered = filtfilt(b, a, signal)

plt.plot(band_filtered)
plt.title("Band-pass Filtered Signal")
plt.xlabel("Index")
plt.ylabel(num_cols[0])
plt.show()

fft_band = np.fft.rfft(band_filtered)
plt.plot(frequencies, np.abs(fft_band))
plt.title("Band-pass Filter FFT")
plt.xlabel("frequency (Hz)")
plt.ylabel("magnitude")
plt.xscale('log')
plt.yscale('log')
plt.xlim(10, 5000)
plt.show()
Using column: Age
Signal length: 500
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Compressed Sensing

In [6]:
# use the first numeric column as the “signal”
num_cols = df.select_dtypes(include='number').columns
signal_col = num_cols[0]
data = df[signal_col].to_numpy()

print(f"Using numeric column: {signal_col}")
print(f"Length of signal: {len(data)}")

#
# parameters
#
fsample = 10000         # pretend sampling rate (same as original code)
tmax = len(data)/fsample
fraction = 0.05
np.random.seed(50)

#
# build time axis
#
time = np.linspace(0, tmax, len(data))
n = len(data)

#
# plot raw data as “time series”
#
plt.figure()
plt.plot(time, data)
plt.title(f"Time Series from Dataset Column: {signal_col}")
plt.xlabel("time (s)")
plt.xlim(0, time.min()+0.02)
plt.show()

#
# DCT matrix
#
dct = np.zeros((n, n))
for i in range(n):
    dct[:, i] = np.sqrt(2/n) * np.cos(np.pi*(2*i+1)*np.arange(n)/(2*n))
dct[0, :] *= 1/np.sqrt(2)

#
# forward DCT transform
#
trans = dct @ data
freq = np.arange(n) / (2*tmax)

plt.figure()
plt.plot(freq, trans)
plt.title("Discrete Cosine Transform (Dataset)")
plt.xlabel("frequency")
plt.xlim(0, np.max(freq)*0.3)
plt.show()

#
# inverse DCT
#
inv = np.transpose(dct) @ trans

plt.figure()
plt.plot(time, inv)
plt.title("Inverse DCT Reconstruction")
plt.xlabel("time (s)")
plt.xlim(0, time.min()+0.02)
plt.show()

#
# random sampling
#
nrandom = int(n*fraction)
index = (np.random.rand(nrandom) * n).astype(int)
random = data[index]

plt.figure()
plt.plot(time, data)
plt.scatter(time[index], random, c='r')
plt.title("Random Samples from Signal")
plt.xlabel("time (s)")
plt.xlim(0, time.min()+0.02)
plt.show()

#
# gradient descent helper
#
def gradmin(f, df, start, scale, steps):
    x = np.copy(start)
    for i in range(steps):
        x -= scale * df(x)
    return x

#
# minimize squared error (no regularization)
#
def df0(x):
    invx = np.transpose(dct) @ x
    grad = np.zeros(n)
    for i in range(n):
        grad[i] = np.sum(2*(invx[index] - random) * dct[i, index])
    return grad

def f0(x):
    invx = np.transpose(dct) @ x
    return np.sum((invx[index] - random)**2)

start = np.random.rand(n)
scale = 0.1
steps = 100

fit = gradmin(f0, df0, start, scale, steps)

plt.figure()
plt.plot(freq, fit)
plt.title("Minimize DCT Error")
plt.xlabel("frequency")
plt.show()

#
# minimize error + L2
#
def dfL2(x):
    invx = np.transpose(dct) @ x
    grad = np.zeros(n)
    for i in range(n):
        grad[i] = np.sum(2*(invx[index] - random) * dct[i, index]) + 2*x[i]
    return grad

def fL2(x):
    invx = np.transpose(dct) @ x
    return np.sum((invx[index] - random)**2) + np.sum(x**2)

start = np.random.rand(n)
fit = gradmin(fL2, dfL2, start, scale, steps)

plt.figure()
plt.plot(freq, fit)
plt.title("Minimize DCT Error + L2 Regularization")
plt.xlabel("frequency")
plt.show()

#
# minimize error + L1
#
def dfL1(x):
    invx = np.transpose(dct) @ x
    grad = np.zeros(n)
    for i in range(n):
        grad[i] = np.sum(2*(invx[index] - random) * dct[i, index]) + np.sign(x[i])
    return grad

def fL1(x):
    invx = np.transpose(dct) @ x
    return np.sum((invx[index] - random)**2) + np.sum(np.abs(x))

start = np.random.rand(n)
fit = gradmin(fL1, dfL1, start, scale, steps)

plt.figure()
plt.plot(freq, fit)
plt.title("Minimize DCT Error + L1 Regularization")
plt.xlabel("frequency")
plt.show()
Using numeric column: Age
Length of signal: 500
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]: