<Home
Probability 02/12/2025
In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# -----------------------------
# Load CSV data
# -----------------------------
df = pd.read_csv("datasets/youtube_video.csv")
# Use view_count as data. Missing values are dropped first: a single NaN
# would turn both the mean and the standard deviation into NaN, and the
# Gaussian curve below would then be all-NaN and silently not drawn.
x = df["view_count"].dropna().to_numpy()
# -----------------------------
# Compute statistics
# -----------------------------
mean = np.mean(x)
stddev = np.std(x)  # population standard deviation (np.std default, ddof=0)
npts = len(x)
# -----------------------------
# Plot histogram and points
# -----------------------------
# Aim for about 50 points per bin, but never fewer than one bin:
# npts // 50 is 0 for fewer than 50 rows, and a zero bin count makes the
# histogram call raise.
nbins = max(1, npts // 50)
plt.hist(x, bins=nbins, density=True)
# Rug plot: one tick per observation along y = 0
plt.plot(x, np.zeros_like(x), '|', ms=5)
# -----------------------------
# Plot Gaussian curve
# -----------------------------
# Evaluate the normal density with the sample mean/stddev on 200 evenly
# spaced points covering mean +/- 3 standard deviations.
xi = np.linspace(mean - 3 * stddev, mean + 3 * stddev, 200)
yi = np.exp(-(xi - mean)**2 / (2 * stddev**2)) / np.sqrt(2 * np.pi * stddev**2)
plt.plot(xi, yi)
# -----------------------------
# Labels
# -----------------------------
plt.xlabel("View Count")
plt.ylabel("Probability Density")
plt.title("Histogram of YouTube View Count with Gaussian Fit")
plt.show()
In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
# Read the YouTube video dataset from disk
df = pd.read_csv("datasets/youtube_video.csv")
# Work with the view counts only, discarding rows where the value is missing
data = df["view_count"].dropna()
# Parameters of the normal curve: mean and population standard deviation (ddof=0)
mean = data.mean()
std = data.std(ddof=0)
# Evaluate the normal density on 300 evenly spaced points spanning the observed range
x = np.linspace(start=data.min(), stop=data.max(), num=300)
pdf = norm.pdf(x, loc=mean, scale=std)
# One 10x6 figure with a single axes; everything below is drawn on it
fig, ax = plt.subplots(figsize=(10, 6))
# Density-normalised histogram, so it shares a vertical scale with the curve
ax.hist(data, bins=30, density=True)
# Normal distribution curve
ax.plot(x, pdf)
# Rug plot: one tick per observation along y = 0
ax.plot(data, np.zeros_like(data), '|')
# Labels
ax.set_xlabel("Values")
ax.set_ylabel("Density")
ax.set_title("Histogram with Normal Distribution Curve")
plt.show()