Preprocessing¶
- standardization, sphering (a sphering sketch follows this list)
- subtract the mean and divide by the standard deviation so that all axes are similarly scaled, with zero mean and unit variance
- prevents the analysis from being dominated by whichever variables happen to span the largest numerical range
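Standardization rescales each axis independently; sphering (whitening) also rotates away correlations between axes. A minimal sketch using scikit-learn's PCA with whiten=True (the correlated synthetic data here is purely illustrative):

import numpy as np
from sklearn.decomposition import PCA
#
# whitening rotates to the principal axes and rescales each to unit variance
#
rng = np.random.default_rng(0)
X = rng.normal(size=(1000,2))@np.array([[2.0,1.0],[0.0,0.5]]) # correlated columns
Xwhite = PCA(whiten=True).fit_transform(X)
print(np.cov(Xwhite.T)) # approximately the identity matrix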
In [1]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn.decomposition
np.set_printoptions(precision=1)
#
# load MNIST data
#
X = np.load('datasets/MNIST/xtrain.npy')
y = np.load('datasets/MNIST/ytrain.npy')
print(f"MNIST data shape (records,pixels): {X.shape}")
#
# plot vs two pixel values
#
plt.scatter(X[:,200],X[:,400],c=y)
plt.xlabel("pixel 200")
plt.ylabel("pixel 400")
plt.title("MNIST vs pixels 200 and 400")
plt.colorbar(label="digit")
plt.show()
#
# standardize (zero mean, unit variance) to eliminate dependence on data scaling
#
print(f"data mean: {np.mean(X):.2f}, variance: {np.var(X):.2f}")
X = X-np.mean(X,axis=0)
std = np.std(X,axis=0)
Xscale = X/np.where(std > 0,std,1) # leave constant (zero variance) pixels unscaled
print(f"standardized data mean: {np.mean(Xscale):.2f}, variance: {np.var(Xscale):.2f}")
#
# do 50 component PCA
#
pca = sklearn.decomposition.PCA(n_components=50)
pca.fit(Xscale)
Xpca = pca.transform(Xscale)
plt.plot(pca.explained_variance_,'o')
plt.xlabel('PCA component')
plt.ylabel('explained variance')
plt.title('MNIST PCA')
plt.show()
#
# plot vs first two PCA components
#
plt.scatter(Xpca[:,0],Xpca[:,1],c=y,s=3)
plt.xlabel("principal component 1")
plt.ylabel("principal component 2")
plt.title("MNIST vs two principal components")
plt.colorbar(label="digit")
plt.show()
MNIST data shape (records,pixels): (60000, 784)
data mean: 33.32, variance: 6172.85
standardized data mean: 0.00, variance: 0.91
Time Series¶
- data as a function of time
- example: Dual-Tone Multi-Frequency (DTMF) phone signaling
In [2]:
import numpy as np
from IPython.display import Audio, display
digits = "1234567890"
rate = 44100
def DTMF_tone(digit,duration,rate,amplitude):
    frequencies = {
        '1':(697,1209),'2':(697,1336),'3':(697,1477),
        '4':(770,1209),'5':(770,1336),'6':(770,1477),
        '7':(852,1209),'8':(852,1336),'9':(852,1477),
        '*':(941,1209),'0':(941,1336),'#':(941,1477)}
    low,high = frequencies[digit]
    t = np.linspace(0,duration,int(rate*duration),endpoint=False)
    tone = amplitude*(np.sin(2*np.pi*low*t)+np.sin(2*np.pi*high*t))
    return tone
def generate_DTMF(digits,duration=0.2,silence=0.1,rate=44100,amplitude=0.5):
    data = np.array([])
    gap = np.zeros(int(rate*silence)) # pause between digits, using the silence argument
    for digit in digits:
        tone = DTMF_tone(digit,duration,rate,amplitude)
        data = np.concatenate((data,tone,gap))
    return data
data = generate_DTMF(digits,rate=rate)
data = data/np.max(np.abs(data))*(2**15-1)
data = data.astype(np.int16)
display(Audio(data,rate=rate))
sonification¶
- listen to data, even if it doesn't start as audio, to find patterns and recognize changes (one mapping is sketched below)
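The DTMF example above is already audio; data that isn't can still be sonified. A minimal sketch, assuming one simple mapping (data values to pitch, with a random walk as a hypothetical stand-in for a real data series):

import numpy as np
from IPython.display import Audio, display
#
# map a data series to a pitch contour, then integrate frequency to get phase
#
rate = 44100
values = np.cumsum(np.random.randn(100)) # stand-in for any data series
values = (values-values.min())/(values.max()-values.min())
f = 300+600*np.interp(np.linspace(0,1,2*rate),np.linspace(0,1,len(values)),values)
phase = 2*np.pi*np.cumsum(f)/rate
display(Audio(np.sin(phase),rate=rate))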
In [3]:
import matplotlib.pyplot as plt
plt.plot(data)
plt.title("audio")
plt.show()
plt.plot(data[14000:15000]) # zoom in on a single tone
plt.title("waveform")
plt.show()
Fast Fourier Transform (FFT)¶
- efficiently maps time series to frequency distribution
- uses complex numbers to represent phase
- the Discrete Cosine Transform (DCT) is a real-valued alternative (a quick check follows)
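A quick round-trip check of the DCT as a real-valued transform, using SciPy's dct/idct (the test signal is illustrative):

import numpy as np
from scipy.fft import dct,idct
#
# the DCT of a real signal is real, and the inverse transform recovers it
#
t = np.linspace(0,1,1000,endpoint=False)
x = np.cos(2*np.pi*5*t)
X = dct(x,norm='ortho')
print(np.iscomplexobj(X)) # False
print(np.allclose(x,idct(X,norm='ortho'))) # True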
In [4]:
import matplotlib.pyplot as plt
import numpy as np
fft = np.fft.fft(data)
frequencies = np.fft.fftfreq(len(data),d=1/rate)
plt.plot(frequencies,abs(fft))
plt.xlim(0,2000)
plt.title('FFT')
plt.xlabel('frequency (Hz)')
plt.ylabel('magnitude')
plt.show()
In [5]:
import matplotlib.pyplot as plt
import numpy as np
plt.specgram(data,Fs=rate,scale='linear',cmap='inferno')
plt.title("spectrogram")
plt.xlabel('time (s)')
plt.ylabel('frequency (Hz)')
plt.ylim(0,2000)
plt.show()
wavelets¶
- time series are a function of time, spectra are a function of frequency; wavelets combine aspects of both by using basis functions that are local in both time and frequency
- the PyWavelets package implements these transforms (a sketch follows)
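A minimal sketch of a multilevel discrete wavelet transform with PyWavelets (assuming the pywt package is installed; the noisy step is an illustrative signal):

import numpy as np
import pywt
#
# decompose a noisy step into approximation and detail coefficients, then reconstruct
#
x = np.concatenate((np.zeros(500),np.ones(500)))+0.1*np.random.randn(1000)
coeffs = pywt.wavedec(x,'db4',level=4)
print([len(c) for c in coeffs]) # coefficient counts from coarsest to finest scale
xr = pywt.waverec(coeffs,'db4')
print(np.allclose(x,xr[:len(x)])) # True (waverec can pad by a sample)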
filters¶
- sampling
- aliasing
- sampling too slowly, so that a high frequency becomes indistinguishable from a lower one (see the first sketch after this list)
- Nyquist sampling rate
- twice the highest frequency to prevent aliasing
- digital filter
- input $x_n$, output $y_n$, time steps $n$
- $y_n = \sum_{i=0}^M b_i x_{n-i} - \sum_{i=1}^N a_i y_{n-i}$, with the normalization $a_0 = 1$ (a direct implementation follows this list)
- common types
- low-pass
- frequencies below a cutoff
- high-pass
- frequencies above a cutoff
- band-pass
- frequencies in a band
- common uses
- reducing noise
- removing unwanted signals, varying baselines
- isolating features of interest
- filter bank
- an array of band-pass filters used for feature detection
- Butterworth filters
- designed to have a maximally flat response in the passband
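Aliasing can be demonstrated in a few lines; a minimal sketch with an illustrative 900 Hz tone sampled at 1 kHz:

import numpy as np
#
# sampled at 1 kHz, a 900 Hz sine is indistinguishable from a sign-flipped 100 Hz sine
#
rate = 1000
t = np.arange(0,0.05,1/rate)
print(np.allclose(np.sin(2*np.pi*900*t),-np.sin(2*np.pi*100*t))) # True

The difference equation above can also be applied directly; a sketch comparing a hand-rolled loop to scipy.signal.lfilter, with illustrative moving-average coefficients:

import numpy as np
from scipy.signal import lfilter
#
# y_n = sum_i b_i x_{n-i} - sum_i a_i y_{n-i}, evaluated term by term
#
def filt(b,a,x):
    y = np.zeros(len(x))
    for n in range(len(x)):
        y[n] = (sum(b[i]*x[n-i] for i in range(len(b)) if n >= i)
               -sum(a[i]*y[n-i] for i in range(1,len(a)) if n >= i))
    return y
b = [0.25,0.25,0.25,0.25] # four-point moving average (a simple low-pass)
a = [1.0]                 # no feedback terms
x = np.random.randn(100)
print(np.allclose(filt(b,a,x),lfilter(b,a,x))) # True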
In [6]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import butter,filtfilt
#
# generate data for filtering example and plot
#
npts = 1000
noise = 0.2
rate = 20000
order = 5
np.random.seed(50)
x = np.linspace(-10,10,npts)
data = np.tanh(x)+np.random.normal(0,noise,npts)
#
plt.plot(data)
plt.title('time series')
plt.show()
#
fft = np.fft.rfft(data)
frequencies = np.fft.rfftfreq(len(data),d=1/rate)
plt.plot(frequencies,np.abs(fft))
plt.title('time series FFT')
plt.xlabel('frequency (Hz)')
plt.ylabel('magnitude')
plt.xscale('log')
plt.yscale('log')
plt.xlim(10,5000)
plt.show()
#
# filter and plot
#
low = 500
b,a = butter(order,low,btype='low',fs=rate,output='ba',analog=False)
filtlow = filtfilt(b,a,data)
plt.plot(filtlow)
plt.title('low pass time series')
plt.show()
#
fft = np.fft.rfft(filtlow)
frequencies = np.fft.rfftfreq(len(filtlow),d=1/rate)
plt.plot(frequencies,np.abs(fft))
plt.title('low pass FFT')
plt.xlabel('frequency (Hz)')
plt.ylabel('magnitude')
plt.xscale('log')
plt.yscale('log')
plt.xlim(10,5000)
plt.show()
#
high = 500
b,a = butter(order,high,btype='high',fs=rate,output='ba',analog=False)
filthigh = filtfilt(b,a,data)
plt.plot(filthigh)
plt.title('high pass time series')
plt.show()
#
fft = np.fft.rfft(filthigh)
frequencies = np.fft.rfftfreq(len(filthigh),d=1/rate)
plt.plot(frequencies,np.abs(fft))
plt.title('high pass FFT')
plt.xlabel('frequency (Hz)')
plt.ylabel('magnitude')
plt.xscale('log')
plt.yscale('log')
plt.xlim(10,5000)
plt.show()
#
low = 40
high = 750
b,a = butter(order,[low,high],btype='band',fs=rate,output='ba',analog=False)
filtband = filtfilt(b,a,data)
plt.plot(filtband)
plt.title('band pass time series')
plt.show()
#
fft = np.fft.rfft(filtband)
frequencies = np.fft.rfftfreq(len(filtband),d=1/rate)
plt.plot(frequencies,np.abs(fft))
plt.title('band pass FFT')
plt.xlabel('frequency (Hz)')
plt.ylabel('magnitude')
plt.xscale('log')
plt.yscale('log')
plt.xlim(10,5000)
plt.show()
Compressed Sensing¶
- use priors on data to reduce requirements below conventional limits
- the $L^2$ norm penalty is isotropic, while the $L^1$ norm ball has corners on the axes, so minimizing it favors sparse vectors
- Total Variation (TV): a prior favoring smooth regions separated by sharp boundaries (a sketch follows this list)
- examples
- X-ray computed tomography
- cryo-electron microscopy
- sampling below the Nyquist limit
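A minimal TV-denoising sketch using scikit-image's denoise_tv_chambolle (assuming scikit-image is available; the noisy step and the weight are illustrative):

import numpy as np
from skimage.restoration import denoise_tv_chambolle
#
# TV denoising smooths the flat regions while keeping the step edge sharp
#
x = np.concatenate((np.zeros(500),np.ones(500)))+0.2*np.random.randn(1000)
xtv = denoise_tv_chambolle(x,weight=0.3)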
In [7]:
import numpy as np
import matplotlib.pyplot as plt
#
# parameters
#
f1 = 697 # DTMF 1
f2 = 1209 # DTMF 1
fsample = 10000
dt = 1/fsample
tmax = 0.25
fraction = 0.05
np.random.seed(50)
#
# data
#
time = np.arange(0,tmax,dt)
n = len(time)
samples = np.sin(2*np.pi*time*f1)+np.sin(2*np.pi*time*f2)
plt.ion()
fig,ax = plt.subplots(1,1)
ax.plot(time,samples)
plt.xlim(0,.02)
plt.xlabel('time')
plt.title('DTMF data')
#
# DCT
#
dct = np.zeros((n,n))
for i in range(n):
    dct[:,i] = np.sqrt(2/n)*np.cos(np.pi*(2*i+1)*np.arange(n)/(2*n))
dct[0,:] *= 1/np.sqrt(2) # rescale the constant row once, making the matrix orthonormal
trans = dct@samples
freq = np.arange(n)/(2*tmax)
fig,ax = plt.subplots(1,1)
ax.plot(freq,trans)
plt.xlim(400,1500)
plt.xlabel('frequency')
plt.title('Discrete Cosine Transform')
#
# inverse DCT
#
inv = np.transpose(dct)@trans # reconstruct the time series from its transform
fig,ax = plt.subplots(1,1)
ax.plot(time,inv)
plt.xlim(0,.02)
plt.xlabel('time')
plt.title('inverse Discrete Cosine Transform')
#
# random sample
#
nrandom = int(n*fraction)
index = (np.random.rand(nrandom)*n).astype(int) # random sample locations (duplicates possible)
random = samples[index]
fig,ax = plt.subplots(1,1)
ax.plot(time,samples)
ax.scatter(time[index],random,c='r')
plt.xlim(0,.02)
plt.xlabel('time')
plt.title('random samples')
#
# gradient minimization
#
def gradmin(f,df,start,scale,steps):
    # fixed-step gradient descent on f using its gradient df
    x = np.copy(start)
    for i in range(steps):
        x -= scale*df(x)
    return x
#
# minimize error
#
def df(x):
    # gradient of the squared reconstruction error at the sampled points
    inv = np.transpose(dct)@x
    grad = np.zeros(n)
    for i in range(n):
        grad[i] = np.sum(2*(inv[index]-random)*dct[i,index])
    return grad
def f(x):
    # squared reconstruction error at the sampled points
    inv = np.transpose(dct)@x
    fn = np.sum((inv[index]-random)**2)
    return fn
start = np.random.rand(n)
scale = 0.1
steps = 100
fit = gradmin(f,df,start,scale,steps)
fig,ax = plt.subplots(1,1)
ax.plot(freq,fit)
plt.xlabel('frequency')
plt.title('minimize DCT error')
plt.xlim(400,1500)
#
# minimize error + L2
#
def df(x):
    # add the gradient of the L2 penalty, 2*x
    inv = np.transpose(dct)@x
    grad = np.zeros(n)
    for i in range(n):
        grad[i] = np.sum(2*(inv[index]-random)*dct[i,index])+2*x[i]
    return grad
def f(x):
    # squared reconstruction error plus the L2 norm penalty
    inv = np.transpose(dct)@x
    fn = np.sum((inv[index]-random)**2)+np.sum(x**2)
    return fn
start = np.random.rand(n)
scale = 0.1
steps = 100
fit = gradmin(f,df,start,scale,steps)
fig,ax = plt.subplots(1,1)
ax.plot(freq,fit)
plt.xlabel('frequency')
plt.title('minimize DCT error + L2 norm')
plt.xlim(400,1500)
#
# minimize error + L1
#
def df(x):
    # add the subgradient of the L1 penalty, sign(x)
    inv = np.transpose(dct)@x
    grad = np.zeros(n)
    for i in range(n):
        grad[i] = np.sum(2*(inv[index]-random)*dct[i,index])+np.sign(x[i])
    return grad
def f(x):
    # squared reconstruction error plus the L1 norm penalty
    inv = np.transpose(dct)@x
    fn = np.sum((inv[index]-random)**2)+np.sum(np.abs(x))
    return fn
start = np.random.rand(n)
scale = 0.1
steps = 100
fit = gradmin(f,df,start,scale,steps)
fig,ax = plt.subplots(1,1)
ax.plot(freq,fit)
plt.xlabel('frequency')
plt.title('minimize DCT error + L1 norm')
plt.xlim(400,1500)
Out[7]:
(400.0, 1500.0)
Assignment¶
- Analyze your data set
- prepare a notebook with the analysis of your data set, store it in your repo, and call it presentation.ipynb
- include a 1920x1080 summary slide describing you, your data, and your analysis; store it in your repo's images folder, and call it presentation.png
Review¶
(c) Neil Gershenfeld for Fab Futures, 12/14/25