[Kelzang Wangdi] - Fab Futures - Data Science

Probability¶

Probability in data analysis¶

Probability is the fundamental mathematical framework that allows data analysts and data scientists to quantify uncertainty and make informed decisions based on data. In data analysis, it is used to model random phenomena, to calculate the likelihood of future events (such as a customer reordering or a machine failing), and to estimate the uncertainty of findings derived from a sample so that conclusions can be drawn about an entire population (a process known as inferential statistics). Key concepts such as probability distributions (e.g., Normal, Binomial) and conditional probability are essential for building and interpreting predictive models, including those used for risk assessment, anomaly detection, and trend forecasting, ultimately transforming raw data into actionable, confidence-backed insights.
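As a small optional check of these ideas, the cell below estimates a plain probability and a conditional probability directly from the StudentsPerformance.csv file used throughout this notebook; the "test preparation course" column and its "completed" value are assumptions about that file and may need to be adjusted.

In [ ]:
import pandas as pd

# Load the same dataset used in the cells below (path taken from those cells)
data = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")

# Event A: a student scores 70 or above in reading
high_reading = data["reading score"] >= 70
p_a = high_reading.mean()                      # P(A) as a simple fraction

# Event B: the student completed the test preparation course
# (column name and value are assumed; adjust them to match the actual file)
completed = data["test preparation course"] == "completed"

# Conditional probability P(A | B) = P(A and B) / P(B)
p_a_given_b = (high_reading & completed).mean() / completed.mean()

print(f"P(reading score >= 70) = {p_a:.2f}")
print(f"P(reading score >= 70 | prep completed) = {p_a_given_b:.2f}")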

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# -------------------------
# 1. Load your data
# -------------------------
# Change the filename to your uploaded CSV
data = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")  

# Change the column name to the one you want to visualize
column_name = "reading score"

# Extract the data as a NumPy array
x = data[column_name].dropna().values

# -------------------------
# 2. Compute statistics
# -------------------------
mean = np.mean(x)
stddev = np.std(x)

# -------------------------
# 3. Plot histogram and points
# -------------------------
plt.hist(x, bins=30, density=True, alpha=0.6)
plt.plot(x, np.zeros_like(x), '|', ms=10)

# -------------------------
# 4. Plot Gaussian curve
# -------------------------
xi = np.linspace(mean - 3*stddev, mean + 3*stddev, 100)
yi = np.exp(-(xi - mean)**2 / (2*stddev**2)) / np.sqrt(2*np.pi*stddev**2)
plt.plot(xi, yi, 'r')
plt.show()
(Figure: histogram of the reading scores overlaid with the fitted Gaussian curve.)

Averaging¶

Averaging Gaussian samples reduces the error by a factor of $\sqrt{N}$ because, when we take multiple independent measurements from a normal distribution, the random fluctuations in each sample tend to cancel each other out. Although each individual measurement has a standard deviation equal to the true width of the distribution, the average of $N$ such measurements becomes increasingly stable. Mathematically, the standard deviation of the mean (also called the standard error) decreases as $\frac{\sigma}{\sqrt{N}}$, where $\sigma$ is the original standard deviation. This means that doubling the number of samples does not cut the error in half; it reduces it more slowly, by a factor of $\sqrt{2}$. As a result, even though taking more samples always improves the accuracy of the estimated mean, the improvement becomes gradually smaller, illustrating the diminishing returns of averaging in statistical estimation.
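Before applying this to the dataset, here is a minimal sketch of the $\sqrt{N}$ scaling using synthetic Gaussian samples; the mean of 0, the standard deviation of 1, and the 5000 repeated trials are arbitrary choices for illustration.

In [ ]:
import numpy as np

rng = np.random.default_rng(0)   # fixed seed so the sketch is reproducible
sigma = 1.0                      # width of the underlying Gaussian (arbitrary)

for N in [10, 100, 1000]:
    # Average N samples, repeated over 5000 trials, and measure the spread
    sample_means = rng.normal(0, sigma, size=(5000, N)).mean(axis=1)
    print(f"N = {N:4d}: spread of the mean = {sample_means.std():.3f}, "
          f"sigma/sqrt(N) = {sigma/np.sqrt(N):.3f}")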

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# -----------------------------
# 1. Load your uploaded data
# -----------------------------
data = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")   # ← change this
column = "math score"               # ← change this

# Extract the data and drop missing values
x = data[column].dropna().values

# True mean and width (std dev) from your data
true_mean = np.mean(x)
true_std = np.std(x)

# -----------------------------
# 2. Sampling settings
# -----------------------------
trials = 100
points = np.arange(10, 500, 25)
means = np.zeros((trials, len(points)))

# -----------------------------
# 3. Sampling from YOUR DATA
# -----------------------------
for p in range(len(points)):
    N = points[p]
    for t in range(trials):
        sample = np.random.choice(x, size=N, replace=True)
        means[t, p] = np.mean(sample)

# -----------------------------
# 4. Theoretical curve
# -----------------------------
plt.plot(points, true_mean + true_std / np.sqrt(points), 'r', label='calculated')
plt.plot(points, true_mean - true_std / np.sqrt(points), 'r')

# -----------------------------
# 5. Estimated mean & stddev
# -----------------------------
estimated_mean = np.mean(means, axis=0)
estimated_std = np.std(means, axis=0)

plt.errorbar(points, estimated_mean, yerr=estimated_std,
             fmt='k-o', capsize=7, label='estimated')

# -----------------------------
# 6. Scatter points for each trial
# -----------------------------
for p in range(len(points)):
    plt.plot(np.full(trials, points[p]), means[:, p], 'o', markersize=2)

plt.xlabel('number of samples averaged')
plt.ylabel('mean estimates')
plt.legend()
plt.show()
(Figure: mean estimates versus number of samples averaged, with the theoretical ±σ/√n band in red and the estimated means with error bars in black.)

Explanation¶

This graph illustrates the Central Limit Theorem and the reduction of uncertainty through averaging. The horizontal axis shows the number of samples averaged (the sample size, $n$), and the vertical axis shows the resulting mean estimates from repeated trials. The individual colored dots are the many different mean estimates obtained at each sample size. The black line with error bars (labeled "estimated") shows the average of these estimates and their standard deviation at each $n$. The red lines (labeled "calculated") show the theoretical population mean plus and minus the theoretical standard error ($\sigma / \sqrt{n}$), indicating how the expected range of sample means shrinks as $n$ increases. As $n$ grows, the spread of the individual mean estimates decreases and both the estimated and calculated uncertainty bands narrow, demonstrating that a larger sample size leads to more consistent and precise estimates that converge toward the true population mean.
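A quick optional numerical check of this behaviour is to compare the spread of the mean estimates with the theoretical $\sigma / \sqrt{n}$ curve; the sketch below assumes the variables points, estimated_std, and true_std from the sampling cell above are still defined in the notebook session.

In [ ]:
import numpy as np

# Assumes `points`, `estimated_std`, and `true_std` from the cell above
for n, emp, theo in zip(points, estimated_std, true_std / np.sqrt(points)):
    print(f"n = {n:3d}: spread of mean estimates = {emp:.2f}, "
          f"sigma/sqrt(n) = {theo:.2f}")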

Entropy¶

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# -------------------------------------------------------
# 1. Load your data
# -------------------------------------------------------
data = pd.read_csv("~/work/kelzang-wangdi/datasets/StudentsPerformance.csv")        # ← change this
column = "reading score"                    # ← change this

# Extract the column values and drop NaN
vals = data[column].dropna().values

# -------------------------------------------------------
# 2. Define histogram parameters
# -------------------------------------------------------
nbins = 256
xmin, xmax = np.min(vals), np.max(vals)
x = np.linspace(xmin, xmax, nbins)

print(f"{nbins} bins = {np.log2(nbins):.0f} bits")

# -------------------------------------------------------
# 3. Entropy function
# -------------------------------------------------------
def entropy(dist):
    positives = dist[dist > 0]     # avoid 0·log(0)
    return -np.sum(positives * np.log2(positives))

# -------------------------------------------------------
# 4. Distributions
# -------------------------------------------------------

# Uniform distribution
uniform = np.ones(nbins) / nbins

# Histogram of your data → convert to probability distribution
hist, edges = np.histogram(vals, bins=nbins, range=(xmin, xmax), density=False)
data_dist = hist / np.sum(hist)   # normalize

# One-hot distribution (peak at middle)
onehot = np.zeros(nbins)
onehot[nbins // 2] = 1

# -------------------------------------------------------
# 5. Plotting
# -------------------------------------------------------
fig, axs = plt.subplots(3, 1, figsize=(8, 10))
fig.canvas.header_visible = False

# Uniform
axs[0].bar(x, uniform, width=(xmax-xmin)/nbins)
axs[0].set_title(f"Uniform entropy: {entropy(uniform):.1f} bits")

# Your data
axs[1].bar(x, data_dist, width=(xmax-xmin)/nbins)
axs[1].set_title(f"Data entropy ({column}): {entropy(data_dist):.1f} bits")

# One-hot
axs[2].bar(x, onehot, width=(xmax-xmin)/nbins)
axs[2].set_title(f"One-hot entropy: {entropy(onehot):.1f} bits")

plt.tight_layout()
plt.show()
256 bins = 8 bits
(Figure: bar plots of the uniform, data, and one-hot distributions with their entropies in bits.)

Explanation on Entropy¶

The probability entropy, often called Shannon entropy $H$, quantifies the average uncertainty or information content inherent in a discrete random variable, $X$, with a probability distribution $p(x_i)$. The formula $H = - \sum_i p(x_i)\log_2 p(x_i)$ is the expected value of the self-information (or "surprisal") of each outcome, where $-\log_2 p(x_i)$ is the information gained upon observing event $x_i$. A high entropy value indicates that the outcomes are nearly equally probable (maximum uncertainty and thus, high information gained when an event occurs), like a fair coin flip, while a low entropy value indicates that some outcomes are highly probable (low uncertainty, less new information is gained from the observation). Essentially, $H$ measures the theoretical minimum number of bits required, on average, to encode or transmit the information generated by the random variable $X$.
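As a minimal illustration of the formula (separate from the analysis above), a fair coin gives 1 bit of entropy while a biased coin gives noticeably less; the entropy helper is redefined here so the cell stands on its own.

In [ ]:
import numpy as np

def entropy(dist):
    positives = dist[dist > 0]               # avoid 0·log(0)
    return -np.sum(positives * np.log2(positives))

fair_coin = np.array([0.5, 0.5])             # maximum uncertainty for two outcomes
biased_coin = np.array([0.9, 0.1])           # one outcome is much more likely

print(f"Fair coin:   {entropy(fair_coin):.2f} bits")    # 1.00 bit
print(f"Biased coin: {entropy(biased_coin):.2f} bits")  # about 0.47 bits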

Reference¶

I used ChatGPT as a supplementary learning tool to deepen my understanding of the Python code, statistical concepts, and machine-learning procedures applied in this work. ChatGPT provided step-by-step explanations of the code structure, clarified the purpose of key functions, and helped interpret outputs such as distributions, entropy, sampling behavior, and convergence patterns. The tool also assisted in adapting existing code examples to my own dataset, making the analytical process clearer and more accessible. Its explanations supported my learning, but all final analysis, interpretation, and implementation decisions were made independently.
