Transforms¶
Well, of all the classes I attended in this course, this was probably the one where I understood the concepts best.¶
The goal of the class was to visualize our data set in the most informative way. This brings us back to the very first class, where Mr. Neil mentioned that the goal of this course is to use computational methods and analysis to get insights from data.¶
Since my data are Bitcoin prices over the last five years, they contain very high and very low values, which calls for standardization. This ensures that large values don't dominate the transforms we will try in this notebook. So, let us standardize the data.¶
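Concretely, standardization is the z-score transform that StandardScaler applies to each column independently:¶
$$z = \frac{x - \mu}{\sigma}$$¶
where $\mu$ and $\sigma$ are the column's mean and standard deviation.¶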
Standardization¶
In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
# Load the daily BTC-USD data from CSV
df = pd.read_csv("datasets/BTC_USD_full_data.csv")
df['Date'] = pd.to_datetime(df['Date'], errors='coerce') # parse dates robustly
df = df.dropna(subset=['Date']) # drop rows with bad dates
df = df.sort_values('Date').reset_index(drop=True) # sort by time
# Quick peek
display(df.head())
|   | Date | Close | High | Low | Open | Volume |
|---|---|---|---|---|---|---|
| 0 | 2020-12-31 | 29001.720703 | 29244.876953 | 28201.992188 | 28841.574219 | 4.675496e+10 |
| 1 | 2021-01-01 | 29374.152344 | 29600.626953 | 28803.585938 | 28994.009766 | 4.073030e+10 |
| 2 | 2021-01-02 | 32127.267578 | 33155.117188 | 29091.181641 | 29376.455078 | 6.786542e+10 |
| 3 | 2021-01-03 | 32782.023438 | 34608.558594 | 32052.316406 | 32129.408203 | 7.866524e+10 |
| 4 | 2021-01-04 | 31971.914062 | 33440.218750 | 28722.755859 | 32810.949219 | 8.116348e+10 |
Let's now create a small set of features that are simple and interpretable.¶
We will create: raw price, log price, log returns, a 7-day moving average, and 21-day volatility. We are doing this because I will work only with the date and the closing price, and for PCA, ICA, FFT, filtering, spectrograms, etc., I need multiple dimensions to discover structure. Each feature captures a different aspect of BTC behavior: trend, daily fluctuations, short-term smoothing, and volatility.¶
The raw price shows the overall long-term trend, while the log price transforms exponential growth into a more stable form that is easier to analyze. Log returns remove the trend entirely and highlight short-term movements, volatility spikes, and sudden shocks. The 7-day moving average smooths out noise and reveals local short-term trends, and the 21-day volatility measures the level of market instability or calm over time. Together, these features capture trend, movement, smoothing, and risk, giving PCA and ICA rich information to uncover the dominant patterns and independent signals within the BTC data.¶
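For reference, writing $P_t$ for the closing price on day $t$, the transformed features computed in the next cell are¶
$$\text{LogPrice}_t = \ln P_t, \qquad r_t = \ln P_t - \ln P_{t-1},$$¶
with SMA7 the 7-day rolling mean of $P_t$ and Vol21 the 21-day rolling standard deviation of the log returns $r_t$.¶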
In [2]:
prices = df['Close'].astype(float).values # make sure numeric
logp = np.log(prices) # stabilize scale with log
logret = np.concatenate([[0.0], np.diff(logp)]) # log returns (first entry = 0)
sma7 = pd.Series(prices).rolling(window=7, min_periods=1).mean().values
vol21 = pd.Series(logret).rolling(window=21, min_periods=1).std().values
# Put into DataFrame indexed by Date for convenience
features = pd.DataFrame({
"Price": prices,
"LogPrice": logp,
"LogReturn": logret,
"SMA7": sma7,
"Vol21": vol21
}, index=df['Date'])
# Fill any remaining gaps (e.g., the first Vol21 value is NaN)
features = features.bfill().ffill()
# Inspect
display(features.head())
| Date | Price | LogPrice | LogReturn | SMA7 | Vol21 |
|---|---|---|---|---|---|
| 2020-12-31 | 29001.720703 | 10.275110 | 0.000000 | 29001.720703 | 0.009023 |
| 2021-01-01 | 29374.152344 | 10.287870 | 0.012760 | 29187.936523 | 0.009023 |
| 2021-01-02 | 32127.267578 | 10.377460 | 0.089590 | 30167.713542 | 0.048463 |
| 2021-01-03 | 32782.023438 | 10.397636 | 0.020175 | 30821.291016 | 0.040179 |
| 2021-01-04 | 31971.914062 | 10.372613 | -0.025022 | 31051.415625 | 0.042781 |
In [3]:
# Standardize the features
scaler = StandardScaler()
X_std = scaler.fit_transform(features.values)  # numpy array, shape (n_samples, n_features)
# Put back into a DataFrame for easy inspection of the standardized values
features_std = pd.DataFrame(X_std, index=features.index, columns=features.columns)
display(features_std.head())
#Quick sanity check: mean ~ 0, std ~ 1
print("means (approx):", np.round(features_std.mean(), 4).to_dict())
print("stds (approx):", np.round(features_std.std(ddof=0), 4).to_dict())
| Date | Price | LogPrice | LogReturn | SMA7 | Vol21 |
|---|---|---|---|---|---|
| 2020-12-31 | -0.846010 | -0.846393 | -0.019755 | -0.844594 | -1.703927 |
| 2021-01-01 | -0.833322 | -0.823359 | 0.395544 | -0.838235 | -1.703927 |
| 2021-01-02 | -0.739529 | -0.661632 | 2.896134 | -0.804778 | 1.731121 |
| 2021-01-03 | -0.717223 | -0.625211 | 0.636887 | -0.782460 | 1.009644 |
| 2021-01-04 | -0.744821 | -0.670382 | -0.834162 | -0.774601 | 1.236275 |
means (approx): {'Price': -0.0, 'LogPrice': -0.0, 'LogReturn': -0.0, 'SMA7': 0.0, 'Vol21': -0.0}
stds (approx): {'Price': 1.0, 'LogPrice': 1.0, 'LogReturn': 1.0, 'SMA7': 1.0, 'Vol21': 1.0}
With standardization done, every feature now has zero mean and unit variance, as the output above confirms.¶
Now, we will try PCA¶
Why PCA? Just before the standardization step, I generated features from my data: raw prices, log prices, log returns, a 7-day moving average, and 21-day volatility. These features are likely correlated with each other, so to understand the structure of the data we will now use Principal Component Analysis.¶
PCA will reduce the features to a few latent factors that explain the major patterns in the data.¶
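Under the hood, PCA finds the eigenvectors of the sample covariance matrix of the standardized features,¶
$$C = \frac{1}{n-1}\, X^\top X, \qquad C\, w_k = \lambda_k w_k,$$¶
and projects the data onto the leading eigenvectors $w_k$; each eigenvalue $\lambda_k$ is the variance explained by component $k$. Because $X$ is standardized, $C$ is essentially the correlation matrix of the features.¶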
In [4]:
# SIMPLE PCA ON STANDARDIZED FEATURES
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# PREPARE INPUT FOR PCA
X = features_std.values # use already-standardized features (no re-scaling)
#FIT PCA MODEL
pca = PCA(n_components=3) # keep first 3 principal components
Xpca = pca.fit_transform(X) # project data into PCA space
#STORE PCA RESULTS AS A DATAFRAME
Xpca_df = pd.DataFrame(
Xpca,
index=features_std.index, # keep original time index
columns=['PC1', 'PC2', 'PC3']
)
#PLOT EXPLAINED VARIANCE
plt.figure(figsize=(8,4))
plt.plot(np.arange(1, 4), pca.explained_variance_, marker='o') # variance per PC
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance')
plt.title('PCA Explained Variance')
plt.grid(True)
plt.show()
#PLOT PCA COMPONENTS OVER TIME
plt.figure(figsize=(12,5))
plt.plot(Xpca_df.index, Xpca_df['PC1'], label='PC1')
plt.plot(Xpca_df.index, Xpca_df['PC2'], label='PC2')
plt.plot(Xpca_df.index, Xpca_df['PC3'], label='PC3')
plt.xlabel('Date')
plt.ylabel('Component Value')
plt.title('PCA Components Over Time')
plt.legend()
plt.grid(True)
plt.show()
#SCATTER PLOT: PC1 VS PC2
plt.figure(figsize=(8,6))
plt.scatter(
Xpca_df['PC1'], # x-axis: first component
Xpca_df['PC2'], # y-axis: second component
s=10
)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('PCA Scatter: PC1 vs PC2')
plt.grid(True)
plt.show()
#PCA LOADINGS (FEATURE CONTRIBUTIONS)
loadings = pd.DataFrame(
pca.components_.T, # transpose for feature x component format
index=features_std.columns, # original feature names
columns=['PC1', 'PC2', 'PC3']
)
display(loadings)
| Feature | PC1 | PC2 | PC3 |
|---|---|---|---|
| Price | 0.570173 | 0.011002 | 0.083890 |
| LogPrice | 0.558443 | 0.024055 | 0.171266 |
| LogReturn | 0.010131 | 0.997252 | -0.062939 |
| SMA7 | 0.569499 | -0.030993 | 0.085902 |
| Vol21 | -0.196502 | 0.061880 | 0.975853 |
Interpretation of the PCA Results¶
The PCA results indicate that most of the meaningful variation in the dataset is captured by the first principal component, with diminishing contributions from subsequent components. The explained variance plot shows a sharp drop from PC1 to PC2 and PC3, suggesting that the system is largely driven by one dominant underlying factor rather than multiple equally strong independent effects.¶
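To quantify the drop, we can print the explained-variance ratios and their cumulative sum (the exact numbers depend on the data you loaded, but PC1 should dominate):¶
In [ ]:
import numpy as np
# Fraction of total variance captured by each component, and the running total
ratios = pca.explained_variance_ratio_
print("per component:", np.round(ratios, 3))
print("cumulative:   ", np.round(np.cumsum(ratios), 3))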
PC1 (Primary Market Trend Component)¶
PC1 explains the largest share of variance and is strongly influenced by Price, LogPrice, and SMA7. Its smooth, persistent evolution over time suggests that PC1 captures the long-term market trend or price level movement. The clear upward and downward phases indicate regime-like behavior, consistent with bull and bear market dynamics.¶
PC2 (Return / Short-Term Fluctuation Component)¶
LogReturn dominates PC2 and shows high-frequency oscillations around zero. This behavior indicates that PC2 captures short-term volatility and return shocks rather than sustained trends. Its lack of persistence suggests it reflects transient market reactions and noise.¶
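A quick check on this reading: given the 0.997 loading, the correlation between the standardized log returns and the PC2 scores should be close to 1 in magnitude (the sign can flip, since principal components are defined only up to sign):¶
In [ ]:
import numpy as np
# Correlation between the standardized LogReturn feature and the PC2 scores
r = np.corrcoef(features_std['LogReturn'].values, Xpca_df['PC2'].values)[0, 1]
print(f"corr(LogReturn, PC2) = {r:.3f}")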
PC3 (Volatility Structure Component)¶
PC3 loads strongly on Vol21 and shows smaller-scale variations. This component represents volatility intensity independent of price direction, capturing changes in market uncertainty rather than price movement itself.¶
Time-Series Behavior¶
The time-series plot shows PC1 evolving smoothly, confirming its role as a trend component. PC2 and PC3 fluctuate rapidly with no long-term drift, reinforcing their interpretation as short-term dynamics and volatility effects. This separation indicates that PCA has successfully disentangled trend, return, and volatility information into orthogonal components.¶
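We can verify the orthogonality claim directly: PCA scores are uncorrelated by construction, so the off-diagonal entries of their correlation matrix should be numerically zero:¶
In [ ]:
# Off-diagonal correlations between the PC scores should be ~0
display(Xpca_df.corr().round(6))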
Scatter Plot (PC1 vs PC2)¶
The elongated horizontal structure suggests that variability is much stronger along PC1 than PC2. The absence of clear clusters implies a continuous market state rather than sharply separated regimes, though mild density changes may indicate gradual transitions.¶
Overall Insight¶
The market dynamics are primarily one-dimensional at a structural level, driven by price trend.¶
Returns and volatility act as secondary, fast-moving processes around this dominant trend.¶
PCA confirms that the chosen features contain overlapping information, and that dimensionality reduction is justified for downstream modeling or regime analysis.¶
This makes PCA an effective diagnostic step that validates feature structure and informs later modeling choices such as clustering, regime detection, or sequence models like LSTMs.¶
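As a final, more playful transform, the next cell sonifies the log returns: each daily value is rescaled to a digit from 0 to 9 and mapped to the corresponding DTMF (telephone keypad) tone, so calm stretches and volatility spikes become audible as changing pitch patterns.¶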
In [6]:
import numpy as np
from IPython.display import Audio, display
# Map the BTC log returns to keypad digits 0-9
logret = features['LogReturn'].values
scaled = ((logret - logret.min()) / (logret.max() - logret.min()) * 9).astype(int)
digits = ''.join(str(d) for d in scaled)  # one DTMF digit per day
# DTMF tone generator
rate = 44100
def DTMF_tone(digit, duration=0.15, rate=44100, amplitude=0.5):
freqs = {
'1':(697,1209),'2':(697,1336),'3':(697,1477),
'4':(770,1209),'5':(770,1336),'6':(770,1477),
'7':(852,1209),'8':(852,1336),'9':(852,1477),
'0':(941,1336)}
if digit not in freqs:
return np.zeros(int(rate*duration))
low, high = freqs[digit]
t = np.linspace(0, duration, int(rate*duration), endpoint=False)
return amplitude * (np.sin(2*np.pi*low*t) + np.sin(2*np.pi*high*t))
# Generate DTMF audio
def generate_DTMF(digits, duration=0.15, silence=0.05, rate=44100):
data = np.array([])
for d in digits:
tone = DTMF_tone(d, duration, rate)
data = np.concatenate((data, tone, np.zeros(int(rate*silence))))
return data
audio_data = generate_DTMF(digits)
# Normalize audio to -1..1 (safe for IPython Audio)
audio_data = audio_data / np.max(np.abs(audio_data))
# Play the audio in the notebook
display(Audio(audio_data, rate=rate))
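If you would rather keep the sonification as a file than an inline player, here is a minimal sketch using scipy.io.wavfile (this assumes SciPy is installed; the file name btc_dtmf.wav is just an example):¶
In [ ]:
import numpy as np
from scipy.io import wavfile
# Convert the normalized float signal to 16-bit PCM and write it to disk
pcm = (audio_data * 32767).astype(np.int16)
wavfile.write("btc_dtmf.wav", rate, pcm)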