[Rinchen Khandu] - Fab Futures - Data Science
Home About

Transformation¶

In [ ]:
## Principal Components Analysis (PCA)

- find a linear transform to principal components that minimize correlation and maximize explained variance
- used for reducing dimensionality, finding features, making low-dimensional plots of high-dimensional data
- based on diagonalizing the covariance matrix
- [PCA](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html)
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

np.set_printoptions(precision=2)

#
# load the dataset; PCA is defined on numeric features only,
# so keep just the numeric columns and drop rows with missing values
#
data = pd.read_csv("~/work/rinchen-khandu/datasets/student_depression_dataset.csv")

numerical_cols = data.select_dtypes(include=[np.number]).columns
X = data[numerical_cols].dropna().values

print(f"Data shape (records, features): {X.shape}")

#
# optional: scatter the first two numeric columns
# (guarded so the cell still runs if fewer than two numeric columns exist)
#
if X.shape[1] >= 2:
    plt.scatter(X[:, 0], X[:, 1], c='blue', alpha=0.6)
    plt.xlabel(numerical_cols[0])
    plt.ylabel(numerical_cols[1])
    plt.title(f"{numerical_cols[0]} vs {numerical_cols[1]}")
    plt.show()

#
# standardize data (zero mean, unit variance); constant columns
# (std == 0) are left mean-centered rather than divided by zero
#
X_mean = np.mean(X, axis=0)
X_std = np.std(X, axis=0)
Xscale = (X - X_mean) / np.where(X_std > 0, X_std, 1)

print(f"Standardized data mean: {np.mean(Xscale):.2f}, variance: {np.var(Xscale):.2f}")

#
# do PCA: at most 50 components, never more than the number of features
# (the original used an assignment expression whose target `Xshape`
# was never read again; a plain min() says the same thing)
#
n_components = min(50, Xscale.shape[1])
pca = PCA(n_components=n_components)
pca.fit(Xscale)
Xpca = pca.transform(Xscale)

#
# scree plot: explained variance of each principal component
#
plt.plot(pca.explained_variance_, 'o-')
plt.xlabel('PCA component')
plt.ylabel('Explained variance')
plt.title('PCA Explained Variance')
plt.show()

#
# project the data onto the first two principal components
#
if Xpca.shape[1] >= 2:
    plt.scatter(Xpca[:, 0], Xpca[:, 1], c='green', s=20, alpha=0.6)
    plt.xlabel("Principal component 1")
    plt.ylabel("Principal component 2")
    plt.title("Data projected on first two PCA components")
    plt.show()
Data shape (records, features): (27901, 9)
No description has been provided for this image
Standardized data mean: 0.00, variance: 1.00
No description has been provided for this image
No description has been provided for this image

Independent Components Analysis (ICA)¶

  • used for blind source separation (the cocktail party problem)
  • separates unknown additively mixed components by maximizing their independence
  • FastICA

Time Series¶

  • data as a function of time
  • example: Dual-Tone Multi-Frequency (DTMF) phone signaling
In [ ]:
import numpy as np
import pandas as pd
from IPython.display import Audio, display

#
# Load the dataset (external CSV of student depression survey data)
#
data_csv = pd.read_csv("~/work/rinchen-khandu/datasets/student_depression_dataset.csv")

# Example: use the 'Depression' column as the numeric sequence to sonify
# (presumably 0/1 labels — TODO confirm against the dataset schema)
values = data_csv["Depression"].dropna().values

# map each value onto a DTMF key '0'-'9' via modulo 10; DTMF_tone below
# expects single-character digit strings
digits = [str(int(v) % 10) for v in values]

print("Digits to generate tones:", digits[:20], "...")  # preview first 20 digits

#
# DTMF tone functions
#
rate = 44100  # audio sample rate in Hz (standard CD-quality rate)

def DTMF_tone(digit, duration, rate, amplitude=0.5):
    """Return one DTMF tone as a float numpy array.

    Parameters:
        digit: one of '0'-'9', '*', '#' (any other key raises KeyError)
        duration: tone length in seconds
        rate: sample rate in Hz
        amplitude: per-sine amplitude (peak of the sum can reach 2*amplitude)

    Returns int(rate*duration) samples of the sum of the digit's low- and
    high-group sine frequencies, per the standard DTMF keypad layout.
    """
    frequencies = {
        '1':(697,1209),'2':(697,1336),'3':(697,1477),
        '4':(770,1209),'5':(770,1336),'6':(770,1477),
        '7':(852,1209),'8':(852,1336),'9':(852,1477),
        '*':(941,1209),'0':(941,1336),'#':(941,1477)}
    low, high = frequencies[digit]
    # endpoint=False keeps the sample grid uniform across concatenated tones
    t = np.linspace(0, duration, int(rate*duration), endpoint=False)
    tone = amplitude*(np.sin(2*np.pi*low*t) + np.sin(2*np.pi*high*t))
    return tone

def generate_DTMF(digits, duration=0.2, pause=0.1, rate=44100, amplitude=0.5):
    """Concatenate a DTMF tone plus a silent pause for each digit.

    Collect all segments first and concatenate once at the end: the
    original called np.concatenate inside the loop, which re-copies
    everything already generated on every iteration (O(n^2) in the
    number of digits — significant for thousands of tones).
    """
    # silence is identical for every digit, so build it once
    silence = np.zeros(int(rate*pause))
    segments = []
    for digit in digits:
        segments.append(DTMF_tone(digit, duration, rate, amplitude))
        segments.append(silence)
    if not segments:
        # match the original's return value for an empty digit sequence
        return np.array([], dtype=np.float32)
    return np.concatenate(segments)

#
# Generate and play DTMF audio
# NOTE(review): the dataset has ~28k rows, so this renders ~28k tones
# (~2.3 hours of audio at 0.3 s per digit) — consider slicing `digits`
#
audio_data = generate_DTMF(digits, rate=rate)

# scale to the 16-bit PCM range; the peak is nonzero here because every
# tone has amplitude 0.5 (an all-silent input would divide by zero)
audio_data = audio_data / np.max(np.abs(audio_data)) * (2**15 - 1)
audio_data = audio_data.astype(np.int16)

display(Audio(audio_data, rate=rate))

Compressed Sensing¶

  • use priors on data to reduce requirements below conventional limits
    • $L^2$ norm is isotropic, $L^1$ norm is maximized along axes (vectors are sparse)
    • Total Variation (TV): smooth regions with sharp boundaries
  • examples
    • X-ray computed tomography
    • cryo electron microscopy
    • sampling below the Nyquist limit
In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#
# ---------- PARAMETERS ----------
#
fsample = 1000      # sample rate (arbitrary for your data)
fraction = 0.05     # fraction of random samples
max_n = 2000        # cap on signal length: the DCT below builds a dense
                    # n x n matrix, so the full dataset (~28k rows) would
                    # need ~6 GB and O(n^2) work per gradient step
np.random.seed(50)

#
# ---------- LOAD YOUR DATA ----------
#
data = pd.read_csv("~/work/rinchen-khandu/datasets/student_depression_dataset.csv")
signal = data["Depression"].dropna().values   # or 'Age' column
signal = signal[:max_n]                       # truncate to a tractable length
signal = signal - np.mean(signal)             # zero mean
n = len(signal)
time = np.linspace(0, 1, n)                   # normalized time axis

#
# ---------- PLOT ORIGINAL DATA ----------
#
plt.figure()
plt.plot(time, signal, label="Original Data")
plt.xlabel("Normalized Time")
plt.ylabel("Depression")
plt.title("Original Student Depression Data")
plt.show()

#
# ---------- DISCRETE COSINE TRANSFORM ----------
# column i of `dct` is the orthonormal DCT-II basis vector for sample i,
# so `dct @ signal` transforms and `dct.T @ coeffs` inverts
#
dct = np.zeros((n, n))
for i in range(n):
    dct[:, i] = np.sqrt(2 / n) * np.cos(np.pi * (2*i+1) * np.arange(n) / (2*n))
dct[0, :] /= np.sqrt(2)  # DC row needs the extra 1/sqrt(2) for orthonormality
trans = dct @ signal
freq = np.arange(n) / 2  # approximate frequency axis

plt.figure()
plt.plot(freq, trans)
plt.xlabel("Frequency")
plt.ylabel("DCT Coefficient")
plt.title("Discrete Cosine Transform")
plt.show()

#
# ---------- INVERSE DCT ----------
#
inv = np.transpose(dct) @ trans
plt.figure()
plt.plot(time, inv)
plt.xlabel("Normalized Time")
plt.ylabel("Depression")
plt.title("Inverse Discrete Cosine Transform")
plt.show()

#
# ---------- RANDOM SAMPLING ----------
# sample without replacement: the original drew indices with
# (rand*n).astype(int), which can repeat an index and thereby
# waste part of the measurement budget on duplicate samples
#
nrandom = int(n*fraction)
index = np.random.choice(n, size=nrandom, replace=False)
random_samples = signal[index]

plt.figure()
plt.plot(time, signal, label="Signal")
plt.scatter(time[index], random_samples, c='r', label="Random Samples")
plt.xlabel("Normalized Time")
plt.ylabel("Depression")
plt.title("Random Samples from Data")
plt.legend()
plt.show()

#
# ---------- GRADIENT MINIMIZATION FUNCTION ----------
#
def gradmin(f, df, start, scale, steps):
    """Fixed-step gradient descent.

    Runs `steps` iterations of x <- x - scale * df(x) from a copy of
    `start` (the caller's array is not modified) and returns the final
    point.  `f` is accepted for interface symmetry with the objective
    functions but is never evaluated here.
    """
    point = start.copy()
    remaining = steps
    while remaining > 0:
        point -= scale * df(point)
        remaining -= 1
    return point

#
# minimize error
#
def df_l2(x):
    """Gradient of f_l2 with respect to the DCT coefficients x.

    Vectorized: the original looped over all n coefficients, computing
    grad[i] = sum(2*(inv[index]-random_samples)*dct[i, index]) one at a
    time; the single matrix-vector product below is mathematically
    identical and avoids O(n * len(index)) Python-level work.
    """
    global random_samples, index, dct
    inv = np.transpose(dct) @ x              # reconstruct signal from coefficients
    residual = inv[index] - random_samples   # error at the sampled points only
    return 2.0 * (dct[:, index] @ residual)

def f_l2(x):
    """Sum of squared errors between the reconstruction and the samples."""
    global random_samples, index, dct
    inv = np.transpose(dct) @ x
    return np.sum((inv[index]-random_samples)**2)

start = np.random.rand(n)
fit_l2 = gradmin(f_l2, df_l2, start, scale=0.1, steps=100)

plt.figure()
plt.plot(freq, fit_l2)
plt.xlabel("Frequency")
plt.ylabel("Coefficient")
plt.title("Minimize DCT Error")
plt.show()

#
# minimize error + L2 regularization
#
def df_l2reg(x):
    """Gradient of f_l2reg: data misfit plus 2*x from the L2 penalty.

    Vectorized replacement for the original per-coefficient loop
    grad[i] = sum(2*(inv[index]-random_samples)*dct[i, index]) + 2*x[i];
    the matrix-vector product is mathematically identical but avoids
    O(n * len(index)) Python-level work.
    """
    global random_samples, index, dct
    inv = np.transpose(dct) @ x              # reconstruct signal from coefficients
    residual = inv[index] - random_samples   # error at the sampled points only
    return 2.0 * (dct[:, index] @ residual) + 2.0 * x

def f_l2reg(x):
    """Squared sampling error plus the squared L2 norm of the coefficients."""
    global random_samples, index, dct
    inv = np.transpose(dct) @ x
    return np.sum((inv[index]-random_samples)**2) + np.sum(x**2)

start = np.random.rand(n)
fit_l2reg = gradmin(f_l2reg, df_l2reg, start, scale=0.1, steps=100)

plt.figure()
plt.plot(freq, fit_l2reg)
plt.xlabel("Frequency")
plt.ylabel("Coefficient")
plt.title("Minimize DCT Error + L2 Norm")
plt.show()

#
# minimize error + L1 regularization
#
def df_l1reg(x):
    """(Sub)gradient of f_l1reg: data misfit plus sign(x) from the L1 penalty.

    Vectorized replacement for the original per-coefficient loop
    grad[i] = sum(2*(inv[index]-random_samples)*dct[i, index]) + sign(x[i]);
    the matrix-vector product is mathematically identical but avoids
    O(n * len(index)) Python-level work.
    """
    global random_samples, index, dct
    inv = np.transpose(dct) @ x              # reconstruct signal from coefficients
    residual = inv[index] - random_samples   # error at the sampled points only
    return 2.0 * (dct[:, index] @ residual) + np.sign(x)

def f_l1reg(x):
    """Squared sampling error plus the L1 norm (promotes sparse coefficients)."""
    global random_samples, index, dct
    inv = np.transpose(dct) @ x
    return np.sum((inv[index]-random_samples)**2) + np.sum(np.abs(x))

start = np.random.rand(n)
fit_l1reg = gradmin(f_l1reg, df_l1reg, start, scale=0.1, steps=100)

plt.figure()
plt.plot(freq, fit_l1reg)
plt.xlabel("Frequency")
plt.ylabel("Coefficient")
plt.title("Minimize DCT Error + L1 Norm")
plt.show()
In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# ---------- LOAD YOUR DATA ----------
data = pd.read_csv("~/work/rinchen-khandu/datasets/student_depression_dataset.csv")
x_col = "Age"
y_col = "Sleeping Duration"
# NOTE(review): confirm this column name exists and holds numeric values;
# np.histogram2d below requires numeric, NaN-free input

x = data[x_col].values
y = data[y_col].values

# keep only rows where both values are finite (histogram2d cannot take NaN)
valid = np.isfinite(x) & np.isfinite(y)
x, y = x[valid], y[valid]

# ---------- CREATE 2D SIGNAL WITH HISTOGRAM ----------
n_bins_x = 50  # number of bins along x_col
n_bins_y = 50  # number of bins along y_col

signal_2d, xedges, yedges = np.histogram2d(x, y, bins=[n_bins_x, n_bins_y])
# (the original also built an unused meshgrid of the bin edges; the
# extent= argument to imshow makes it unnecessary)

plt.figure()
plt.imshow(signal_2d.T, origin='lower', aspect='auto',
           extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])
plt.colorbar(label="Count")
plt.xlabel(x_col)
plt.ylabel(y_col)
# title now reflects the columns actually plotted (the original
# hard-coded "Age and Depression" while plotting Age vs Sleeping Duration)
plt.title(f"2D Signal from {x_col} and {y_col}")
plt.show()

# ---------- 2D DCT ----------
def dct2(signal):
    n, m = signal.shape
    dct_n = np.zeros((n, n))
    dct_m = np.zeros((m, m))
    
    for i in range(n):
        dct_n[:, i] = np.sqrt(2/n) * np.cos(np.pi*(2*i+1) * np.arange(n)/(2*n))
    dct_n[0, :] /= np.sqrt(2)
    
    for j in range(m):
        dct_m[:, j] = np.sqrt(2/m) * np.cos(np.pi*(2*j+1) * np.arange(m)/(2*m))
    dct_m[0, :] /= np.sqrt(2)
    
    return dct_n @ signal @ dct_m.T

def idct2(dct_signal):
    n, m = dct_signal.shape
    dct_n = np.zeros((n, n))
    dct_m = np.zeros((m, m))
    
    for i in range(n):
        dct_n[:, i] = np.sqrt(2/n) * np.cos(np.pi*(2*i+1) * np.arange(n)/(2*n))
    dct_n[0, :] /= np.sqrt(2)
    
    for j in range(m):
        dct_m[:, j] = np.sqrt(2/m) * np.cos(np.pi*(2*j+1) * np.arange(m)/(2*m))
    dct_m[0, :] /= np.sqrt(2)
    
    return dct_n.T @ dct_signal @ dct_m

# transform the 2D histogram into DCT coefficient space
dct_signal = dct2(signal_2d)

plt.figure()
plt.imshow(dct_signal.T, origin='lower', aspect='auto')
plt.colorbar(label="DCT Coefficient")
plt.title("2D DCT")
plt.show()

# ---------- INVERSE 2D DCT ----------
# round trip: should reproduce the original histogram (orthonormal basis)
inv_signal = idct2(dct_signal)

plt.figure()
plt.imshow(inv_signal.T, origin='lower', aspect='auto')
plt.colorbar(label="Count")
plt.title("Inverse 2D DCT")
plt.show()
In [ ]: