Principal Components Analysis (PCA)
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
np.set_printoptions(precision=1)
#
# Load mental health dataset
#
df = pd.read_csv("../kinley-dema/datasets/mental_health.csv")
print(f"Data shape (records, features): {df.shape}")
# Select numeric columns
num_cols = df.select_dtypes(include='number').columns
X = df[num_cols].to_numpy()
print(f"Numeric data shape: {X.shape}")
#
# Plot two numeric columns
#
plt.scatter(X[:,0], X[:,1])
plt.xlabel(num_cols[0])
plt.ylabel(num_cols[1])
plt.title("Dataset: Scatter of first two numeric features")
plt.show()
#
# Standardize data
#
print(f"Data mean: {np.mean(X):.2f}, variance: {np.var(X):.2f}")
X_centered = X - np.mean(X, axis=0)
std = np.std(X_centered, axis=0)
Xscale = X_centered / np.where(std > 0, std, 1)
print(f"Standardized mean: {np.mean(Xscale):.2f}, variance: {np.var(Xscale):.2f}")
#
# PCA
#
pca = sklearn.decomposition.PCA(n_components=5)
pca.fit(Xscale)
Xpca = pca.transform(Xscale)
plt.plot(pca.explained_variance_, 'o')
plt.xlabel("PCA component")
plt.ylabel("Explained variance")
plt.title("PCA on Mental Health Dataset")
plt.show()
#
# Plot PCA components
#
plt.scatter(Xpca[:,0], Xpca[:,1])
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("First Two PCA Components")
plt.show()
Data shape (records, features): (500, 10) Numeric data shape: (500, 7)
Data mean: 9.34, variance: 113.12 Standardized mean: 0.00, variance: 1.00
Filters
In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import butter, filtfilt
# Select first numeric column to use as "signal"
num_cols = df.select_dtypes(include='number').columns
signal = df[num_cols[0]].to_numpy()
print(f"Using column: {num_cols[0]}")
print(f"Signal length: {len(signal)}")
#
# Plot raw time series
#
plt.plot(signal)
plt.title("Raw Time Series (from dataset)")
plt.xlabel("Index")
plt.ylabel(num_cols[0])
plt.show()
#
# FFT of raw signal
#
rate = 20000 # sampling rate assumption (same as your original code)
fft = np.fft.rfft(signal)
frequencies = np.fft.rfftfreq(len(signal), d=1/rate)
plt.plot(frequencies, np.abs(fft))
plt.title("Raw Time Series FFT")
plt.xlabel("frequency (Hz)")
plt.ylabel("magnitude")
plt.xscale('log')
plt.yscale('log')
plt.xlim(10, 5000)
plt.show()
#
# Filtering parameters
#
order = 5
# ----------------------------- LOW PASS -----------------------------
low = 500
b, a = butter(order, low, btype='low', fs=rate)
low_filtered = filtfilt(b, a, signal)
plt.plot(low_filtered)
plt.title("Low-pass Filtered Signal")
plt.xlabel("Index")
plt.ylabel(num_cols[0])
plt.show()
fft_low = np.fft.rfft(low_filtered)
plt.plot(frequencies, np.abs(fft_low))
plt.title("Low-pass Filter FFT")
plt.xlabel("frequency (Hz)")
plt.ylabel("magnitude")
plt.xscale('log')
plt.yscale('log')
plt.xlim(10, 5000)
plt.show()
# ----------------------------- HIGH PASS -----------------------------
high = 500
b, a = butter(order, high, btype='high', fs=rate)
high_filtered = filtfilt(b, a, signal)
plt.plot(high_filtered)
plt.title("High-pass Filtered Signal")
plt.xlabel("Index")
plt.ylabel(num_cols[0])
plt.show()
fft_high = np.fft.rfft(high_filtered)
plt.plot(frequencies, np.abs(fft_high))
plt.title("High-pass Filter FFT")
plt.xlabel("frequency (Hz)")
plt.ylabel("magnitude")
plt.xscale('log')
plt.yscale('log')
plt.xlim(10, 5000)
plt.show()
# ----------------------------- BAND PASS -----------------------------
low = 40
high = 750
b, a = butter(order, [low, high], btype='band', fs=rate)
band_filtered = filtfilt(b, a, signal)
plt.plot(band_filtered)
plt.title("Band-pass Filtered Signal")
plt.xlabel("Index")
plt.ylabel(num_cols[0])
plt.show()
fft_band = np.fft.rfft(band_filtered)
plt.plot(frequencies, np.abs(fft_band))
plt.title("Band-pass Filter FFT")
plt.xlabel("frequency (Hz)")
plt.ylabel("magnitude")
plt.xscale('log')
plt.yscale('log')
plt.xlim(10, 5000)
plt.show()
Using column: Age Signal length: 500
Compressed Sensing
In [6]:
# use the first numeric column as the “signal”
num_cols = df.select_dtypes(include='number').columns
signal_col = num_cols[0]
data = df[signal_col].to_numpy()
print(f"Using numeric column: {signal_col}")
print(f"Length of signal: {len(data)}")
#
# parameters
#
fsample = 10000 # pretend sampling rate (same as original code)
tmax = len(data)/fsample
fraction = 0.05
np.random.seed(50)
#
# build time axis
#
time = np.linspace(0, tmax, len(data))
n = len(data)
#
# plot raw data as “time series”
#
plt.figure()
plt.plot(time, data)
plt.title(f"Time Series from Dataset Column: {signal_col}")
plt.xlabel("time (s)")
plt.xlim(0, time.min()+0.02)
plt.show()
#
# DCT matrix
#
dct = np.zeros((n, n))
for i in range(n):
dct[:, i] = np.sqrt(2/n) * np.cos(np.pi*(2*i+1)*np.arange(n)/(2*n))
dct[0, :] *= 1/np.sqrt(2)
#
# forward DCT transform
#
trans = dct @ data
freq = np.arange(n) / (2*tmax)
plt.figure()
plt.plot(freq, trans)
plt.title("Discrete Cosine Transform (Dataset)")
plt.xlabel("frequency")
plt.xlim(0, np.max(freq)*0.3)
plt.show()
#
# inverse DCT
#
inv = np.transpose(dct) @ trans
plt.figure()
plt.plot(time, inv)
plt.title("Inverse DCT Reconstruction")
plt.xlabel("time (s)")
plt.xlim(0, time.min()+0.02)
plt.show()
#
# random sampling
#
nrandom = int(n*fraction)
index = (np.random.rand(nrandom) * n).astype(int)
random = data[index]
plt.figure()
plt.plot(time, data)
plt.scatter(time[index], random, c='r')
plt.title("Random Samples from Signal")
plt.xlabel("time (s)")
plt.xlim(0, time.min()+0.02)
plt.show()
#
# gradient descent helper
#
def gradmin(f, df, start, scale, steps):
x = np.copy(start)
for i in range(steps):
x -= scale * df(x)
return x
#
# minimize squared error (no regularization)
#
def df0(x):
invx = np.transpose(dct) @ x
grad = np.zeros(n)
for i in range(n):
grad[i] = np.sum(2*(invx[index] - random) * dct[i, index])
return grad
def f0(x):
invx = np.transpose(dct) @ x
return np.sum((invx[index] - random)**2)
start = np.random.rand(n)
scale = 0.1
steps = 100
fit = gradmin(f0, df0, start, scale, steps)
plt.figure()
plt.plot(freq, fit)
plt.title("Minimize DCT Error")
plt.xlabel("frequency")
plt.show()
#
# minimize error + L2
#
def dfL2(x):
invx = np.transpose(dct) @ x
grad = np.zeros(n)
for i in range(n):
grad[i] = np.sum(2*(invx[index] - random) * dct[i, index]) + 2*x[i]
return grad
def fL2(x):
invx = np.transpose(dct) @ x
return np.sum((invx[index] - random)**2) + np.sum(x**2)
start = np.random.rand(n)
fit = gradmin(fL2, dfL2, start, scale, steps)
plt.figure()
plt.plot(freq, fit)
plt.title("Minimize DCT Error + L2 Regularization")
plt.xlabel("frequency")
plt.show()
#
# minimize error + L1
#
def dfL1(x):
invx = np.transpose(dct) @ x
grad = np.zeros(n)
for i in range(n):
grad[i] = np.sum(2*(invx[index] - random) * dct[i, index]) + np.sign(x[i])
return grad
def fL1(x):
invx = np.transpose(dct) @ x
return np.sum((invx[index] - random)**2) + np.sum(np.abs(x))
start = np.random.rand(n)
fit = gradmin(fL1, dfL1, start, scale, steps)
plt.figure()
plt.plot(freq, fit)
plt.title("Minimize DCT Error + L1 Regularization")
plt.xlabel("frequency")
plt.show()
Using numeric column: Age Length of signal: 500
In [ ]: