[Wangd Lhamo] - Fab Futures - Data Science
Home About

Transforms¶

A transform is any change you apply to your data to make it cleaner, more meaningful, or more suitable for machine learning.

df['Marks'] = df['Marks'].astype(float)

from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegression from sklearn.pipeline import Pipeline

pipeline = Pipeline([ ('scale', StandardScaler()), # transform step ('model', LogisticRegression()) ])

In [ ]:
### Fast Fourier Transform
- is a mathematical algorithm that effeciently computes the Discrete Fourier Transform (DFT) of a signal 
Fourier Transform: The Concept
A Fourier Transform converts a time-domain signal into a frequency-domain representation.
It tells you what frequencies are present in your signal and their amplitudes.
Example: A sound wave over time → shows which notes (frequencies) are in the sound.

    

import numpy as np import matplotlib.pyplot as plt

Sample signal: 2 sine waves at 5Hz and 20Hz¶

fs = 100 # Sampling frequency (samples per second) t = np.arange(0, 1, 1/fs) signal = np.sin(2np.pi5t) + 0.5np.sin(2np.pi20*t)

Compute FFT¶

fft_values = np.fft.fft(signal) frequencies = np.fft.fftfreq(len(t), 1/fs)

Plot magnitude spectrum¶

plt.plot(frequencies[:len(t)//2], np.abs(fft_values)[:len(t)//2]) plt.title("FFT of Signal") plt.xlabel("Frequency (Hz)") plt.ylabel("Amplitude") plt.show()

Dataset¶

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Example: Student marks dataset
df = pd.DataFrame({
    "Student": ["S1","S2","S3","S4","S5","S6","S7","S8"],
    "Marks": [75, 85, 60, 90, 70, 80, 65, 95]
})

marks = df["Marks"].values
# Apply FFT
fft_values = np.fft.fft(marks)
frequencies = np.fft.fftfreq(len(marks))  # Normalized frequency (cycles per sample)
plt.figure(figsize=(8,5))
plt.stem(frequencies, np.abs(fft_values), use_line_collection=True)
plt.title("FFT of Student Marks")
plt.xlabel("Frequency (cycles per sample)")
plt.ylabel("Amplitude")
plt.show()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[3], line 16
     14 frequencies = np.fft.fftfreq(len(marks))  # Normalized frequency (cycles per sample)
     15 plt.figure(figsize=(8,5))
---> 16 plt.stem(frequencies, np.abs(fft_values), use_line_collection=True)
     17 plt.title("FFT of Student Marks")
     18 plt.xlabel("Frequency (cycles per sample)")

TypeError: stem() got an unexpected keyword argument 'use_line_collection'
<Figure size 800x500 with 0 Axes>
In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Sample student marks dataset
df = pd.DataFrame({
    "Student": ["S1","S2","S3","S4","S5","S6","S7","S8"],
    "Marks": [75, 85, 60, 90, 70, 80, 65, 95]
})

marks = df["Marks"].values

# Compute FFT
fft_values = np.fft.fft(marks)
frequencies = np.fft.fftfreq(len(marks))  # Normalized frequency

# Plot FFT magnitude spectrum
plt.figure(figsize=(8,5))
plt.stem(frequencies, np.abs(fft_values))  # removed use_line_collection
plt.title("FFT of Student Marks")
plt.xlabel("Frequency (cycles per sample)")
plt.ylabel("Amplitude")
plt.show()
No description has been provided for this image
In [5]:
N = len(marks)
plt.figure(figsize=(8,5))
plt.stem(frequencies[:N//2], np.abs(fft_values)[:N//2])
plt.title("FFT of Student Marks (Positive Frequencies)")
plt.xlabel("Frequency")
plt.ylabel("Amplitude")
plt.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [2]:
import pandas as pd

df = pd.DataFrame({
    'Name': ['Pema', 'Sonam', 'Karma', 'Thinley'],
    'Marks': [55, 72, 88, 45],
    'Gender': ['F', 'M', 'M', 'M']
})

df
Out[2]:
Name Marks Gender
0 Pema 55 F
1 Sonam 72 M
2 Karma 88 M
3 Thinley 45 M

Transform: Conver Data Type¶

Marks from integer to float

In [11]:
df['Marks'] = df['Marks'].astype(float)
df
Out[11]:
Name Marks Gender Marks_scaled
0 Pema 55.0 F -0.609145
1 Sonam 72.0 M 0.426401
2 Karma 88.0 M 1.401033
3 Thinley 45.0 M -1.218290

Scaling¶

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df['Marks_scaled'] = scaler.fit_transform(df[['Marks']])
df
Out[9]:
Name Marks Gender Marks_scaled
0 Pema 55.0 F -0.609145
1 Sonam 72.0 M 0.426401
2 Karma 88.0 M 1.401033
3 Thinley 45.0 M -1.218290
In [7]:
# =========================================================
# MNIST IMAGE FFT VISUALIZATION - ALL IN ONE
# =========================================================

import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist

# -------------------------------
# 1. Load MNIST dataset
# -------------------------------
(X_train, y_train), (_, _) = mnist.load_data()
img = X_train[0]       # First image
label = y_train[0]

# -------------------------------
# 2. Compute 2D FFT
# -------------------------------
fft_img = np.fft.fft2(img)
fft_shifted = np.fft.fftshift(fft_img)  # center zero frequency
magnitude_spectrum = np.log(np.abs(fft_shifted) + 1)

# -------------------------------
# 3. Low-Pass and High-Pass Filters
# -------------------------------
rows, cols = img.shape
crow, ccol = rows//2, cols//2

# Low-pass filter mask (center 20x20)
mask_low = np.zeros((rows, cols))
mask_low[crow-10:crow+10, ccol-10:ccol+10] = 1

# High-pass filter mask (remove center)
mask_high = 1 - mask_low

# Apply masks
fft_low = fft_shifted * mask_low
fft_high = fft_shifted * mask_high

# Reconstruct images
img_low = np.fft.ifft2(np.fft.ifftshift(fft_low)).real
img_high = np.fft.ifft2(np.fft.ifftshift(fft_high)).real

# -------------------------------
# 4. Plot all images side-by-side
# -------------------------------
plt.figure(figsize=(16,4))

# Original image
plt.subplot(1,4,1)
plt.imshow(img, cmap='gray')
plt.title(f"Original Image - Label {label}")
plt.axis('off')

# FFT Magnitude Spectrum
plt.subplot(1,4,2)
plt.imshow(magnitude_spectrum, cmap='gray')
plt.title("FFT Magnitude Spectrum")
plt.axis('off')

# Low-pass filtered
plt.subplot(1,4,3)
pl
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[7], line 7
      5 import numpy as np
      6 import matplotlib.pyplot as plt
----> 7 from tensorflow.keras.datasets import mnist
      9 # -------------------------------
     10 # 1. Load MNIST dataset
     11 # -------------------------------
     12 (X_train, y_train), (_, _) = mnist.load_data()

ModuleNotFoundError: No module named 'tensorflow'
In [ ]:
 
In [ ]:
 

Animated Spectrogram Example¶

In [8]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import spectrogram

# Create a sample signal: 5 seconds, 2 Hz and 5 Hz components
fs = 1000  # Sampling frequency in Hz
t = np.linspace(0, 5, 5*fs)
signal = np.sin(2*np.pi*2*t) + 0.5*np.sin(2*np.pi*5*t)

# Compute spectrogram
frequencies, times, Sxx = spectrogram(signal, fs)

# Plot spectrogram
plt.figure(figsize=(10, 6))
plt.pcolormesh(times, frequencies, 10 * np.log10(Sxx), shading='gouraud')
plt.ylabel('Frequency [Hz]')
plt.xlabel('Time [sec]')
plt.title('Spectrogram of Synthetic Signal')
plt.colorbar(label='Power/Frequency (dB/Hz)')
plt.show()
No description has been provided for this image
In [ ]:
 

DTMF¶

In [11]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.io.wavfile import write
from scipy.signal import spectrogram

# Sampling parameters
fs = 8000        # Sampling frequency
duration = 0.5   # Duration of each tone (seconds)

# DTMF frequency mapping
dtmf_freqs = {
    '1': (697, 1209),
    '2': (697, 1336),
    '3': (697, 1477),
    '4': (770, 1209),
    '5': (770, 1336),
    '6': (770, 1477),
    '7': (852, 1209),
    '8': (852, 1336),
    '9': (852, 1477),
    '*': (941, 1209),
    '0': (941, 1336),
    '#': (941, 1477)
}

# Function to generate a DTMF tone for a key
def generate_dtmf(key):
    t = np.linspace(0, duration, int(fs*duration), endpoint=False)
    f1, f2 = dtmf_freqs[key]
    tone = np.sin(2*np.pi*f1*t) + np.sin(2*np.pi*f2*t)
    return tone

# Example: generate a sequence '123#'
sequence = '123#'
signal = np.concatenate([generate_dtmf(k) for k in sequence])

# Normalize
signal = signal / np.max(np.abs(signal))

# Save as WAV file
write('dtmf_sequence.wav', fs, (signal * 32767).astype(np.int16))

# Plot spectrogram
frequencies, times, Sxx = spectrogram(signal, fs)
plt.figure(figsize=(10,6))
plt.pcolormesh(times, frequencies, 10*np.log10(Sxx), shading='gouraud')
plt.title('Spectrogram of DTMF Sequence')
plt.xlabel('Time [s]')
plt.ylabel('Frequency [Hz]')
plt.colorbar(label='Power/Frequency (dB/Hz)')
plt.ylim(500, 1600)
plt.show()
No description has been provided for this image
In [12]:
Explanation

Signal creation:

First 2.5 seconds → 2 Hz sine wave.
After 2.5 seconds → 5 Hz sine wave.

This simulates a frequency appearing over time.
Spectrogram computation:

spectrogram(signal[:frame], fs) computes the spectrogram for the signal up to the current frame.
Animation:
FuncAnimation updates the plot frame by frame, showing how the spectrogram changes as the new frequency appears.
Result:
You’ll see the 2 Hz line first, then the 5 Hz line appears halfway, which visually demonstrates the power of spectrograms.
  Cell In[12], line 5
    First 2.5 seconds → 2 Hz sine wave.
                      ^
SyntaxError: invalid character '→' (U+2192)
In [13]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile

# DTMF frequency mapping
dtmf_freqs = {
    (697, 1209): '1', (697, 1336): '2', (697, 1477): '3',
    (770, 1209): '4', (770, 1336): '5', (770, 1477): '6',
    (852, 1209): '7', (852, 1336): '8', (852, 1477): '9',
    (941, 1209): '*', (941, 1336): '0', (941, 1477): '#'
}

low_freqs = np.array([697, 770, 852, 941])
high_freqs = np.array([1209, 1336, 1477])

# Read the audio file
fs, audio = wavfile.read("dtmf_sequence.wav")

# If stereo, convert to mono
if len(audio.shape) > 1:
    audio = audio.mean(axis=1)

# Split signal into chunks (assuming ~0.5s per key)
chunk_size = int(0.5 * fs)
num_chunks = len(audio) // chunk_size

detected_keys = []

for i in range(num_chunks):
    chunk = audio[i*chunk_size : (i+1)*chunk_size]
    # Compute FFT
    fft_vals = np.fft.fft(chunk)
    fft_freqs = np.fft.fftfreq(len(chunk), 1/fs)
    
    # Take only positive frequencies
    fft_vals = np.abs(fft_vals[:len(chunk)//2])
    fft_freqs = fft_freqs[:len(chunk)//2]
    
    # Find peaks corresponding to DTMF frequencies
    low_peak = low_freqs[np.argmin([abs(f - fft_freqs[np.argmax(fft_vals[(fft_freqs>=f-10)&(fft_freqs<=f+10)])]) for f in low_freqs])]
    high_peak = high_freqs[np.argmin([abs(f - fft_freqs[np.argmax(fft_vals[(fft_freqs>=f-10)&(fft_freqs<=f+10)])]) for f in high_freqs])]
    
    detected_keys.append(dtmf_freqs[(low_peak, high_peak)])

print("Detected DTMF sequence:", "".join(detected_keys))

# Optional: plot spectrogram for visualization
from scipy.signal import spectrogram
frequencies, times, Sxx = spectrogram(audio, fs)
plt.figure(figsize=(10,6))
plt.pcolormesh(times, frequencies, 10*np.log10(Sxx), shading='gouraud')
plt.title('Spectrogram of DTMF Audio')
plt.xlabel('Time [s]')
plt.ylabel('Frequency [Hz]')
plt.colorbar(label='Power/Frequency (dB/Hz)')
plt.ylim(500, 1600)
plt.show()
Detected DTMF sequence: 1111
No description has been provided for this image
In [ ]: