Transformation¶
Principal Components Analysis (PCA)¶
- finds an orthogonal linear transform to principal components that are uncorrelated and ordered by explained variance
- used for reducing dimensionality, finding features, and making low-dimensional plots of high-dimensional data
- based on diagonalizing the covariance matrix (see the sketch after this list)
- [PCA](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html)
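A minimal sketch of the covariance-diagonalization view on toy data (the names here are illustrative, not from the cell below): the eigenvectors of the covariance matrix are the principal axes, and the eigenvalues are the explained variances.
In [ ]:
import numpy as np
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))        # toy standardized data
C = np.cov(X, rowvar=False)          # covariance matrix
eigvals, eigvecs = np.linalg.eigh(C) # diagonalize: C = V diag(w) V.T
order = np.argsort(eigvals)[::-1]    # sort by decreasing explained variance
components = eigvecs[:, order]       # principal axes as columns
X_pca = X @ components               # project onto principal components
print(eigvals[order])                # explained variance of each component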
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
np.set_printoptions(precision=2)
#
# load your own data
#
data = pd.read_csv("~/work/rinchen-khandu/datasets/student_depression_dataset.csv")
# select numerical columns only
numerical_cols = data.select_dtypes(include=[np.number]).columns
X = data[numerical_cols].dropna().values
print(f"Data shape (records, features): {X.shape}")
#
# optional: plot vs first two columns
#
plt.scatter(X[:, 0], X[:, 1], c='blue', alpha=0.6)
plt.xlabel(numerical_cols[0])
plt.ylabel(numerical_cols[1])
plt.title(f"{numerical_cols[0]} vs {numerical_cols[1]}")
plt.show()
#
# standardize data (zero mean, unit variance)
#
X_mean = np.mean(X, axis=0)
X_std = np.std(X, axis=0)
Xscale = (X - X_mean) / np.where(X_std > 0, X_std, 1)
print(f"Standardized data mean: {np.mean(Xscale):.2f}, variance: {np.var(Xscale):.2f}")
#
# do PCA
#
n_components = min(50, Xscale.shape[1]) # at most 50 components, capped by the number of features
pca = PCA(n_components=n_components)
pca.fit(Xscale)
Xpca = pca.transform(Xscale)
#
# plot explained variance
#
plt.plot(pca.explained_variance_, 'o-')
plt.xlabel('PCA component')
plt.ylabel('Explained variance')
plt.title('PCA Explained Variance')
plt.show()
#
# plot first two PCA components
#
plt.scatter(Xpca[:, 0], Xpca[:, 1], c='green', s=20, alpha=0.6)
plt.xlabel("Principal component 1")
plt.ylabel("Principal component 2")
plt.title("Data projected on first two PCA components")
plt.show()
Data shape (records, features): (27901, 9)
Standardized data mean: 0.00, variance: 1.00
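A common follow-up, sketched here rather than taken from the cell above, is to use the cumulative explained-variance ratio to choose how many components to keep.
In [ ]:
# cumulative fraction of total variance explained by the first k components
cumvar = np.cumsum(pca.explained_variance_ratio_)
k90 = int(np.argmax(cumvar >= 0.90)) + 1
print(f"{k90} components explain 90% of the variance")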
Time Series¶
- data as a function of time
- example: Dual-Tone Multi-Frequency (DTMF) phone signaling, where each key press is the sum of one low-frequency and one high-frequency sine tone
In [ ]:
import numpy as np
import pandas as pd
from IPython.display import Audio, display
#
# Load your numeric data
#
data_csv = pd.read_csv("~/work/rinchen-khandu/datasets/student_depression_dataset.csv")
# Example: use the 'Depression' column as a numeric sequence
# and map each value to a digit 0-9
values = data_csv["Depression"].dropna().values
# map values to digits 0-9 via modulo 10; keep only the first 20
# values so the generated audio stays short
digits = [str(int(v) % 10) for v in values[:20]]
print("Digits to generate tones:", digits)
#
# DTMF tone functions
#
rate = 44100
def DTMF_tone(digit, duration, rate, amplitude=0.5):
    # each key is the sum of one low-group and one high-group frequency
    frequencies = {
        '1':(697,1209),'2':(697,1336),'3':(697,1477),
        '4':(770,1209),'5':(770,1336),'6':(770,1477),
        '7':(852,1209),'8':(852,1336),'9':(852,1477),
        '*':(941,1209),'0':(941,1336),'#':(941,1477)}
    low, high = frequencies[digit]
    t = np.linspace(0, duration, int(rate*duration), endpoint=False)
    tone = amplitude*(np.sin(2*np.pi*low*t) + np.sin(2*np.pi*high*t))
    return tone
def generate_DTMF(digits, duration=0.2, pause=0.1, rate=44100, amplitude=0.5):
    # concatenate one tone plus a short silence per digit
    data = np.array([], dtype=np.float32)
    for digit in digits:
        tone = DTMF_tone(digit, duration, rate, amplitude)
        silence = np.zeros(int(rate*pause))
        data = np.concatenate((data, tone, silence))
    return data
#
# Generate and play DTMF audio
#
audio_data = generate_DTMF(digits, rate=rate)
# normalize to 16-bit PCM
audio_data = audio_data / np.max(np.abs(audio_data)) * (2**15 - 1)
audio_data = audio_data.astype(np.int16)
display(Audio(audio_data, rate=rate))
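As a quick sanity check, a DTMF tone can be decoded back to its digit by finding the dominant low and high frequencies in its spectrum. This is a minimal sketch using NumPy's FFT and the DTMF_tone function defined above; decode_digit is an illustrative helper, not part of the original notebook.
In [ ]:
import numpy as np
def decode_digit(tone, rate=44100):
    # magnitude spectrum and the frequency of each bin
    spectrum = np.abs(np.fft.rfft(tone))
    freqs = np.fft.rfftfreq(len(tone), d=1/rate)
    def power(f): # spectral magnitude at the bin nearest frequency f
        return spectrum[np.argmin(np.abs(freqs - f))]
    low = max([697, 770, 852, 941], key=power)
    high = max([1209, 1336, 1477], key=power)
    table = {(697,1209):'1',(697,1336):'2',(697,1477):'3',
             (770,1209):'4',(770,1336):'5',(770,1477):'6',
             (852,1209):'7',(852,1336):'8',(852,1477):'9',
             (941,1209):'*',(941,1336):'0',(941,1477):'#'}
    return table[(low, high)]
print(decode_digit(DTMF_tone('5', 0.2, rate))) # expect '5'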
Compressed Sensing¶
- use priors on data to reduce requirements below conventional limits
- $L^2$ norm is isotropic; the $L^1$ norm is smallest along the axes, so minimizing it favors sparse vectors (see the check after this list)
- Total Variation (TV): smooth regions with sharp boundaries
- examples
- X-ray computed tomography
- cryo electron microscopy
- sampling below the Nyquist limit
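To see why minimizing the $L^1$ norm promotes sparsity: among vectors of equal $L^2$ length, the one with all its energy in a single coordinate has the smallest $L^1$ norm. A tiny numeric check:
In [ ]:
import numpy as np
sparse = np.array([1.0, 0.0, 0.0, 0.0]) # energy in one coordinate
dense = np.full(4, 0.5)                 # same L2 norm, spread across coordinates
print(np.linalg.norm(sparse, 2), np.linalg.norm(dense, 2)) # 1.0 1.0
print(np.linalg.norm(sparse, 1), np.linalg.norm(dense, 1)) # 1.0 2.0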
In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
#
# ---------- PARAMETERS ----------
#
fsample = 1000 # sample rate (arbitrary for your data)
fraction = 0.05 # fraction of random samples
np.random.seed(50)
#
# ---------- LOAD YOUR DATA ----------
#
data = pd.read_csv("~/work/rinchen-khandu/datasets/student_depression_dataset.csv")
signal = data["Depression"].dropna().values # or the 'Age' column
signal = signal[:1000] # keep the example tractable: an n-by-n DCT matrix is built below
signal = signal - np.mean(signal) # zero mean
n = len(signal)
time = np.linspace(0, 1, n) # normalized time axis
#
# ---------- PLOT ORIGINAL DATA ----------
#
plt.figure()
plt.plot(time, signal, label="Original Data")
plt.xlabel("Normalized Time")
plt.ylabel("Depression")
plt.title("Original Student Depression Data")
plt.show()
#
# ---------- DISCRETE COSINE TRANSFORM ----------
#
dct = np.zeros((n, n))
for i in range(n):
    dct[:, i] = np.sqrt(2 / n) * np.cos(np.pi * (2*i+1) * np.arange(n) / (2*n))
dct[0, :] /= np.sqrt(2)
trans = dct @ signal
freq = np.arange(n) / 2 # approximate frequency axis
plt.figure()
plt.plot(freq, trans)
plt.xlabel("Frequency")
plt.ylabel("DCT Coefficient")
plt.title("Discrete Cosine Transform")
plt.show()
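#
# sanity check (an added aside, assuming scipy is installed):
# the handmade matrix matches scipy's orthonormal DCT-II
#
from scipy.fft import dct as scipy_dct
assert np.allclose(trans, scipy_dct(signal, norm='ortho'))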
#
# ---------- INVERSE DCT ----------
#
inv = np.transpose(dct) @ trans
plt.figure()
plt.plot(time, inv)
plt.xlabel("Normalized Time")
plt.ylabel("Depression")
plt.title("Inverse Discrete Cosine Transform")
plt.show()
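#
# round-trip check (added): dct is orthonormal, so the inverse recovers the signal exactly
#
assert np.allclose(inv, signal)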
#
# ---------- RANDOM SAMPLING ----------
#
nrandom = int(n*fraction)
index = np.random.choice(n, nrandom, replace=False) # distinct random sample positions
random_samples = signal[index]
plt.figure()
plt.plot(time, signal, label="Signal")
plt.scatter(time[index], random_samples, c='r', label="Random Samples")
plt.xlabel("Normalized Time")
plt.ylabel("Depression")
plt.title("Random Samples from Data")
plt.legend()
plt.show()
#
# ---------- GRADIENT MINIMIZATION FUNCTION ----------
#
def gradmin(f, df, start, scale, steps):
    # fixed-step gradient descent; f is unused here but kept so each
    # objective can be passed alongside its gradient
    x = np.copy(start)
    for _ in range(steps):
        x -= scale * df(x)
    return x
#
# minimize error
#
def df_l2(x):
    # gradient of the squared error at the sampled points
    inv = dct.T @ x
    return 2 * dct[:, index] @ (inv[index] - random_samples)
def f_l2(x):
    inv = dct.T @ x
    return np.sum((inv[index] - random_samples)**2)
start = np.random.rand(n)
fit_l2 = gradmin(f_l2, df_l2, start, scale=0.1, steps=100)
plt.figure()
plt.plot(freq, fit_l2)
plt.xlabel("Frequency")
plt.ylabel("Coefficient")
plt.title("Minimize DCT Error")
plt.show()
#
# minimize error + L2 regularization
#
def df_l2reg(x):
    # squared-error gradient plus the gradient of the L2 penalty
    inv = dct.T @ x
    return 2 * dct[:, index] @ (inv[index] - random_samples) + 2*x
def f_l2reg(x):
    inv = dct.T @ x
    return np.sum((inv[index] - random_samples)**2) + np.sum(x**2)
start = np.random.rand(n)
fit_l2reg = gradmin(f_l2reg, df_l2reg, start, scale=0.1, steps=100)
plt.figure()
plt.plot(freq, fit_l2reg)
plt.xlabel("Frequency")
plt.ylabel("Coefficient")
plt.title("Minimize DCT Error + L2 Norm")
plt.show()
#
# minimize error + L1 regularization
#
def df_l1reg(x):
    # squared-error gradient plus the (sub)gradient of the L1 penalty
    inv = dct.T @ x
    return 2 * dct[:, index] @ (inv[index] - random_samples) + np.sign(x)
def f_l1reg(x):
    inv = dct.T @ x
    return np.sum((inv[index] - random_samples)**2) + np.sum(np.abs(x))
start = np.random.rand(n)
fit_l1reg = gradmin(f_l1reg, df_l1reg, start, scale=0.1, steps=100)
plt.figure()
plt.plot(freq, fit_l1reg)
plt.xlabel("Frequency")
plt.ylabel("Coefficient")
plt.title("Minimize DCT Error + L1 Norm")
plt.show()
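The payoff, sketched below rather than taken from the original cell: map the $L^1$-regularized coefficients back to the time domain and compare the reconstruction with the full signal.
In [ ]:
#
# reconstruct the signal from the L1-regularized DCT coefficients
#
recon = dct.T @ fit_l1reg
plt.figure()
plt.plot(time, signal, label="Original")
plt.plot(time, recon, label="L1 Reconstruction")
plt.scatter(time[index], random_samples, c='r', s=10, label="Random Samples")
plt.xlabel("Normalized Time")
plt.ylabel("Depression")
plt.title("Reconstruction from Random Samples")
plt.legend()
plt.show()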
In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# ---------- LOAD YOUR DATA ----------
data = pd.read_csv("~/work/rinchen-khandu/datasets/student_depression_dataset.csv")
x_col = "Age"
y_col = "Sleeping Duration"
x = data[x_col].values
y = data[y_col].values
# ---------- CREATE 2D SIGNAL WITH HISTOGRAM ----------
n_bins_x = 50 # number of bins along Age
n_bins_y = 50 # number of bins along Sleeping Duration
signal_2d, xedges, yedges = np.histogram2d(x, y, bins=[n_bins_x, n_bins_y])
plt.figure()
plt.imshow(signal_2d.T, origin='lower', aspect='auto',
extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])
plt.colorbar(label="Count")
plt.xlabel(x_col)
plt.ylabel(y_col)
plt.title("2D Signal from Age and Depression")
plt.show()
# ---------- 2D DCT ----------
def dct_matrix(n):
    # orthonormal DCT-II matrix: element [k, i] pairs frequency k with sample i
    D = np.sqrt(2/n) * np.cos(np.pi*(2*np.arange(n)[None, :]+1) * np.arange(n)[:, None]/(2*n))
    D[0, :] /= np.sqrt(2)
    return D
def dct2(signal):
    n, m = signal.shape
    return dct_matrix(n) @ signal @ dct_matrix(m).T
def idct2(dct_signal):
    n, m = dct_signal.shape
    return dct_matrix(n).T @ dct_signal @ dct_matrix(m)
dct_signal = dct2(signal_2d)
plt.figure()
plt.imshow(dct_signal.T, origin='lower', aspect='auto')
plt.colorbar(label="DCT Coefficient")
plt.title("2D DCT")
plt.show()
# ---------- INVERSE 2D DCT ----------
inv_signal = idct2(dct_signal)
plt.figure()
plt.imshow(inv_signal.T, origin='lower', aspect='auto')
plt.colorbar(label="Count")
plt.title("Inverse 2D DCT")
plt.show()
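As a short illustration of sparsity in the DCT basis, and a sketch added here rather than part of the original notebook: keep only (roughly) the largest 5% of 2D DCT coefficients and invert. Most of the histogram's structure survives.
In [ ]:
# zero all but roughly the largest 5% of coefficients, then invert
k = max(1, int(0.05 * dct_signal.size))
thresh = np.sort(np.abs(dct_signal).ravel())[-k]
compressed = np.where(np.abs(dct_signal) >= thresh, dct_signal, 0)
recon = idct2(compressed)
plt.figure()
plt.imshow(recon.T, origin='lower', aspect='auto')
plt.colorbar(label="Count")
plt.title("Reconstruction from Largest 5% of DCT Coefficients")
plt.show()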