import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d
from scipy.stats import norm

# Load and clean
df = pd.read_csv("datasets/BTC_USD_full_data.csv")
# Unparseable dates become NaT and unparseable prices become NaN, so both kinds
# of bad row can be dropped in a single pass below.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df['Close'] = pd.to_numeric(df['Close'], errors='coerce')
df = df.dropna(subset=['Date', 'Close'])

# Strength of the Gaussian smoothing. Kept under its own name so it is not
# confused with the fitted standard deviation `sigma` estimated further down.
# NOTE(review): the filter runs along file row order - presumably chronological; confirm.
smooth_sigma = 15
y_smooth = gaussian_filter1d(df['Close'].values, sigma=smooth_sigma)

## Plotting two histograms: raw data and smoothed data
plt.figure(figsize=(12,6))
plt.hist(df['Close'], bins=50, density=True, alpha=0.6, color='skyblue', label="Raw Data")
plt.hist(y_smooth, bins=50, density=True, alpha=0.5, color='red', label=f"Smoothed (sigma={smooth_sigma})")

plt.xlabel("BTC Closing Price (USD)")
plt.ylabel("Probability Density")
plt.title("Probability Distribution of BTC Closing Prices")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

# Estimating parameters of the normal fit (norm.fit returns location and scale)
mu, sigma = norm.fit(df['Close'])
print(f"Mean (mu): {mu:.2f}, Standard deviation (sigma): {sigma:.2f}")

## Plot PDF
## Histogram of the closing prices with the fitted normal PDF drawn on top
plt.figure(figsize=(12,6))
plt.hist(df['Close'], bins=50, density=True, alpha=0.6, color='skyblue', label="Data")
xmin, xmax = df['Close'].min(), df['Close'].max()
x = np.linspace(xmin, xmax, 1000)
pdf = norm.pdf(x, mu, sigma)
plt.plot(x, pdf, 'r-', lw=2, label='Fitted Normal PDF')

plt.xlabel("BTC Closing Price (USD)")
plt.ylabel("Probability Density")
plt.title("BTC Price Distribution with Fitted Normal Curve")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

# Probability (under the fitted normal) that the BTC price lies in [lower, upper].
# The printed label is built from the same bounds so the message can never
# disagree with the interval that was actually integrated.
lower, upper = 30_000, 35_000
prob = norm.cdf(upper, mu, sigma) - norm.cdf(lower, mu, sigma)
print(f"Probability BTC is between ${lower / 1000:.0f}k and ${upper / 1000:.0f}k: {prob:.2%}")

Mean (mu): 53834.80, Standard deviation (sigma): 29353.19

Probability BTC is between $60k and $40k: 5.22%

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import lognorm


# Step 1 - read the CSV and keep only rows whose Date and Close both parse
df = pd.read_csv("datasets/BTC_USD_full_data.csv")
df = df.assign(
    Date=pd.to_datetime(df['Date'], errors='coerce'),     # bad dates -> NaT
    Close=pd.to_numeric(df['Close'], errors='coerce'),    # bad prices -> NaN
).dropna(subset=['Date', 'Close'])


# Step 2 - fit a log-normal to the closing prices with the location fixed at 0
prices = df['Close'].values
shape, loc, scale = lognorm.fit(prices, floc=0)
print(f"Log-normal parameters:\nShape (σ) = {shape:.4f}, Scale (exp(μ)) = {scale:.2f}")

# Freeze the fitted parameters once so the PDF and CDF calls below share them
fitted = lognorm(shape, loc=0, scale=scale)


# Step 3 - histogram of the prices with the fitted PDF drawn on top
x = np.linspace(prices.min(), prices.max(), 1000)
pdf = fitted.pdf(x)

fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(prices, bins=50, density=True, alpha=0.6, color='skyblue', label="BTC Prices")
ax.plot(x, pdf, 'r-', lw=2, label="Fitted Log-normal PDF")
ax.set_xlabel("BTC Closing Price (USD)")
ax.set_ylabel("Probability Density")
ax.set_title("BTC Price Distribution with Fitted Log-normal Curve")
ax.legend()
ax.grid(alpha=0.3)
plt.show()


# Step 4 - probability mass the fitted model places between $40k and $60k
cdf_low = fitted.cdf(40000)
cdf_high = fitted.cdf(60000)
prob_ln = cdf_high - cdf_low
print(f"Probability BTC is between $40k and $60k (log-normal): {prob_ln:.2%}")

Log-normal parameters:
Shape (σ) = 0.5540, Scale (exp(μ)) = 46349.93

Probability BTC is between $40k and $60k (log-normal): 28.43%

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture


# Step 1 - load the BTC data and discard rows whose Date or Close fails to parse
df = pd.read_csv("datasets/BTC_USD_full_data.csv")
df = df.assign(
    Date=pd.to_datetime(df['Date'], errors='coerce'),
    Close=pd.to_numeric(df['Close'], errors='coerce'),
).dropna(subset=['Date', 'Close'])

# sklearn expects a 2D (n_samples, n_features) array, so add a feature axis
prices = df['Close'].to_numpy()[:, np.newaxis]


# Step 2 - fit the Gaussian Mixture Model
n_components = 2  # how many Gaussian components to mix (tunable)
gmm = GaussianMixture(
    n_components=n_components,
    covariance_type='full',
    random_state=42,  # fixed seed so the fit is repeatable
).fit(prices)

# Flatten the fitted per-component parameters to 1D for printing
means = gmm.means_.flatten()
covariances = gmm.covariances_.flatten()
weights = gmm.weights_.flatten()

for caption, values in (
    ("GMM Means:", means),
    ("GMM Covariances:", covariances),
    ("GMM Weights:", weights),
):
    print(caption, values)


# Step 3 - histogram of the prices with the mixture density on top
x = np.linspace(prices.min(), prices.max(), 1000)
log_density = gmm.score_samples(x[:, np.newaxis])  # log-likelihood of each grid point
pdf = np.exp(log_density)

fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(prices, bins=50, density=True, alpha=0.6, color='skyblue', label="BTC Prices")
ax.plot(x, pdf, 'r-', lw=2, label="GMM Fit (E-M)")
ax.set_xlabel("BTC Closing Price (USD)")
ax.set_ylabel("Probability Density")
ax.set_title(f"BTC Price Distribution (GMM with {n_components} components)")
ax.legend()
ax.grid(alpha=0.3)
plt.show()

GMM Means: [ 41148.55707577 102473.30232336]
GMM Covariances: [2.76787404e+08 1.21048805e+08]
GMM Weights: [0.79313018 0.20686982]

import pandas as pd
import numpy as np


# 1. Load and clean BTC data

df = pd.read_csv("datasets/BTC_USD_full_data.csv")
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df['Close'] = pd.to_numeric(df['Close'], errors='coerce')
df = df.dropna(subset=['Date', 'Close'])
# pct_change compares each row with the row directly above it, so the rows must
# be in chronological order; a newest-first file would otherwise give wrong
# returns. The stable sort keeps the file order of rows sharing a date.
df = df.sort_values('Date', kind='stable')


# 2. Compute daily returns

# Daily return = (current price - previous price) / previous price
returns = df['Close'].pct_change().dropna()  # Drop first NaN (no previous row)


# 3. Calculate mean returns
mean_return = returns.mean()
print(f"Mean daily return of BTC: {mean_return:.6f} (~{mean_return*100:.2f}%)")


# 4. Calculate covariance (variance)

# For a single series the covariance is just the sample variance; ddof=1
# matches np.cov's default normalisation and yields a plain float.
covariance = returns.var(ddof=1)
print(f"Covariance (variance) of BTC daily returns: {covariance:.6f}")


# 5. Calculate standard deviation (optional)

std_dev = np.sqrt(covariance)
print(f"Standard deviation of BTC daily returns: {std_dev:.6f} (~{std_dev*100:.2f}%)")

Mean daily return of BTC: 0.001080 (~0.11%)
Covariance (variance) of BTC daily returns: 0.000946
Standard deviation of BTC daily returns: 0.030753 (~3.08%)

import pandas as pd
import numpy as np
from scipy.stats import entropy


# 1. Load and clean BTC data

df = pd.read_csv("datasets/BTC_USD_full_data.csv")
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df['Close'] = pd.to_numeric(df['Close'], errors='coerce')
df = df.dropna(subset=['Date', 'Close'])

prices = df['Close'].values


# 2. Discretize data into histogram bins

# Density histogram over 50 equal-width bins. No epsilon is added: the values
# are densities rather than probabilities, so a fixed offset would distort the
# distribution, and scipy's entropy() already handles empty bins.
hist, bin_edges = np.histogram(prices, bins=50, density=True)


# 3. Compute entropy

# entropy() rescales its input to sum to 1, which turns the equal-width
# densities into per-bin probabilities. Result is in nats (divide by ln(2) for bits).
H = entropy(hist)
print("Entropy of BTC price distribution:", H)

# Max entropy is reached by a uniform distribution over the same bins: ln(number of bins)
H_max = np.log(len(hist))
H_normalized = H / H_max

print(f"Normalized entropy: {H_normalized:.2f}")

Entropy of BTC price distribution: 3.655182082200529
Normalized entropy: 0.93

Probability Distribution¶

Few notes I took to understand Probability distribution¶

A probability distribution is a mathematical function that provides a way of modeling the likelihood of each outcome in a random experiment(GeeksforGeeks, 2025).¶

There are two types of distribution:¶

*Discrete Probability Distribution: assigns a probability to each outcome of a variable that takes countable values such as 0, 1, 2, ... Examples of discrete probability distributions are:¶

*Bernoulli: Flipping a coin once. The outcome is either Heads (success) or Tails (failure). Probability of Heads = 0.5, Tails = 0.5.¶

*Binomial: Flipping a coin 3 times. How many times will you get Heads? You could get 0, 1, 2, or 3 Heads. Each has a certain probability.¶

*Poisson: Counting how many cars pass by your house in 1 hour. Sometimes 0, 1, 2, or more cars can pass, and we can calculate the chance for each number.¶

*Continuous Probability Distribution: Used for uncountable outcomes like time, height, or temperature.¶

*Uniform Distribution: Example-Random number between 0 and 1. Every number in the interval [0, 1] is equally likely.¶

*Normal (Gaussian) Distribution: Example-Heights of adults. If the average height is 170 cm with a standard deviation of 10 cm, most people are around 170 cm; fewer are much shorter or much taller.¶

*Exponential Distribution: Example-Time between arrivals of buses at a bus stop. Shorter waiting times are more likely than very long ones.¶

*Chi-Square Distribution: Example-Testing if a coin is fair based on multiple flips. Used in hypothesis testing; values are always non-negative.¶

*Beta Distribution: Example-Probability of success in a project based on prior data. Models probabilities that are bounded between 0 and 1.¶

What type of probability distribution is appropriate for my data?¶

So, I tried using the normal Gaussian fit for my data's probability distribution¶

Normal Gaussian Fit¶

Now with this curve, I wanted to see if I am doing the right thing and make sure this normal fit really describes the probability distribution of the data, and in doing so, I came across these two papers:¶

1. Cont, R. Empirical properties of asset returns: stylized facts and statistical issues. Quantitative Finance, 2001.¶

2. Nakamoto, S. Bitcoin: A Peer-to-Peer Electronic Cash System. 2008. https://bitcoin.org/bitcoin.pdf ¶

Though I did not read them in full length, from what I read, I understand that I should not be using a fitted normal distribution for the Bitcoin closing prices. These are few reasons I could gather with assistance from ChatGPT:¶

i. Heavy Tails: Bitcoin returns often have fat tails, meaning extreme events (big spikes or crashes) are more likely than predicted by a normal distribution.¶

ii. Skewness: BTC returns are not symmetric around the mean.¶

iii. Volatility Clustering: Periods of high volatility tend to cluster, violating the independence assumption made when fitting a normal distribution.¶

iv. A normal distribution was fitted to the closing price data to estimate the mean (μ) and standard deviation (σ); given the volatility and skewness of the data, this underestimates risk and leads to misleading probability calculations.¶

These reasons led me to look for another probability distribution for my data.¶

As I described my data to ChatGPT, it suggested that the BTC data I have may follow a distribution more similar to a log-normal than a normal distribution.¶

What is a log-normal distribution?¶

According to Wikipedia, a log-normal (or lognormal) distribution is a continuous probability distribution of a random variable whose logarithm is normally distributed. Thus, if the random variable X is log-normally distributed, then Y = ln X has a normal distribution.¶

So, I just wanted to fool around and try it on my data.¶

Gaussian Mixture Model¶

For now, let's shift our focus and analyze other aspects of the data, such as the daily returns of Bitcoin and the entropy of the raw Bitcoin prices.¶

Covariance of the daily returns¶

Since the GMM fit didn’t look very reliable, I decided to look at volatility instead. I calculated the covariances using daily returns because returns change more evenly over time than raw prices. This way, I can better understand how much Bitcoin prices move up and down day by day.¶

For the returns, I subtracted the previous price from the current price and divided the difference by the previous price.¶

Entropy¶

Entropy is a measure of disorder (uncertainty); in terms of the data, it describes how unpredictable or random the data is.¶

In other words, the BTC price distribution is spread out over a wide range, and there’s a lot of uncertainty about where the price will be on any given day.¶

Entropy gives me an idea of the uncertainty in Bitcoin prices. By looking at both volatility from returns and entropy from prices, I can get a better picture of how Bitcoin behaves, even though they are different ways of looking at the data.¶

References¶

Probability Distribution¶

Few notes I took to understand Probability distribution¶

A probability distribution is a mathematical function that provides a way of modeling the likelihood of each outcome in a random experiment(GeeksforGeeks, 2025).¶

There are two types of distribution:¶

*Discrete Probability Distribution: assigns a probability to each outcome of a variable that takes countable values such as 0, 1, 2, ... Examples of discrete probability distributions are:¶

*Bernoulli: Flipping a coin once. The outcome is either Heads (success) or Tails (failure). Probability of Heads = 0.5, Tails = 0.5.¶

*Binomial: Flipping a coin 3 times. How many times will you get Heads? You could get 0, 1, 2, or 3 Heads. Each has a certain probability.¶

*Poisson: Counting how many cars pass by your house in 1 hour. Sometimes 0, 1, 2, or more cars can pass, and we can calculate the chance for each number.¶

*Continuous Probability Distribution: Used for uncountable outcomes like time, height, or temperature.¶

*Uniform Distribution: Example-Random number between 0 and 1. Every number in the interval [0, 1] is equally likely.¶

*Normal (Gaussian) Distribution: Example-Heights of adults. If the average height is 170 cm with a standard deviation of 10 cm, most people are around 170 cm; fewer are much shorter or much taller.¶

*Exponential Distribution: Example-Time between arrivals of buses at a bus stop. Shorter waiting times are more likely than very long ones.¶

*Chi-Square Distribution: Example-Testing if a coin is fair based on multiple flips. Used in hypothesis testing; values are always non-negative.¶

*Beta Distribution: Example-Probability of success in a project based on prior data. Models probabilities that are bounded between 0 and 1.¶

What type of probability distribution is appropriate for my data?¶

So, I tried using the normal Gaussian fit for my data's probability distribution¶

Normal Gaussian Fit¶

Now with this curve, I wanted to see if I am doing the right thing and make sure this normal fit really describes the probability distribution of the data, and in doing so, I came across these two papers:¶

1. Cont, R. Empirical properties of asset returns: stylized facts and statistical issues. Quantitative Finance, 2001.¶

2. Nakamoto, S. Bitcoin: A Peer-to-Peer Electronic Cash System. 2008. https://bitcoin.org/bitcoin.pdf¶

Though I did not read them in full length, from what I read, I understand that I should not be using a fitted normal distribution for the Bitcoin closing prices. These are few reasons I could gather with assistance from ChatGPT:¶

i. Heavy Tails: Bitcoin returns often have fat tails, meaning extreme events (big spikes or crashes) are more likely than predicted by a normal distribution.¶

ii. Skewness: BTC returns are not symmetric around the mean.¶

iii. Volatility Clustering: Periods of high volatility tend to cluster, violating the independence assumption made when fitting a normal distribution.¶

iv. A normal distribution was fitted to the closing price data to estimate the mean (μ) and standard deviation (σ); given the volatility and skewness of the data, this underestimates risk and leads to misleading probability calculations.¶

These reasons led me to look for another probability distribution for my data.¶

As I described my data to ChatGPT, it suggested that the BTC data I have may follow a distribution more similar to a log-normal than a normal distribution.¶

What is a log-normal distribution?¶

According to Wikipedia, a log-normal (or lognormal) distribution is a continuous probability distribution of a random variable whose logarithm is normally distributed. Thus, if the random variable X is log-normally distributed, then Y = ln X has a normal distribution.¶

So, I just wanted to fool around and try it on my data.¶

Gaussian Mixture Model¶

For now, let's shift our focus and analyze other aspects of the data, such as the daily returns of Bitcoin and the entropy of the raw Bitcoin prices.¶

Covariance of the daily returns¶

Since the GMM fit didn’t look very reliable, I decided to look at volatility instead. I calculated the covariances using daily returns because returns change more evenly over time than raw prices. This way, I can better understand how much Bitcoin prices move up and down day by day.¶

For the returns, I subtracted the previous price from the current price and divided the difference by the previous price.¶

Entropy¶

Entropy is a measure of disorder (uncertainty); in terms of the data, it describes how unpredictable or random the data is.¶

In other words, the BTC price distribution is spread out over a wide range, and there’s a lot of uncertainty about where the price will be on any given day.¶

Entropy gives me an idea of the uncertainty in Bitcoin prices. By looking at both volatility from returns and entropy from prices, I can get a better picture of how Bitcoin behaves, even though they are different ways of looking at the data.¶

References¶

2. Nakamoto, S. Bitcoin: A Peer-to-Peer Electronic Cash System. 2008. https://bitcoin.org/bitcoin.pdf ¶