import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv ('~/work/sonam-dendup/datasets/StudentsPerformance.csv')
df.head(2)

df.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')

df['parental level of education'].value_counts()

parental level of education
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: count, dtype: int64

rating_map =df['parental level of education'].value_counts()
df["math score"].map(rating_map)
math_score = df ['math score']
npts = len(math_score)
mean = np.mean(math_score)
stddev = np.std(math_score)

#Plot
plt.hist(math_score, bins=npts//50, density=True, alpha=0.6, color='gray', edgecolor='black')

plt.plot(math_score, 0*math_score, '|', ms=20, color='black')

xi = np.linspace(mean-3*stddev, mean+3*stddev, 100)
yi = np.exp(-(xi-mean)**2/(2*stddev**2))/np.sqrt(2*np.pi*stddev**2)
plt.plot(xi, yi, 'b', lw=2, label='Gaussian Fit')

plt.xlabel("Math Score")
plt.ylabel("Density")
plt.title("Density Estimation: Gaussian Curve")
plt.legend()
plt.savefig("fig2.1.png", dpi=300, bbox_inches='tight', transparent=True)
plt.show()

rating_map =df['parental level of education'].value_counts()
df["math score"].map(rating_map)
math_score = df ['math score']

trials = 100
points = np.arange(1, len(math_score)+1)  # sample sizes from 1 to number of students
means = np.zeros((trials, len(points)))

for i, n in enumerate(points):
    for t in range(trials):
        means[t, i] = np.mean(np.random.choice(math_score, size=n, replace=True))  # sampling with replacement

mean_estimates = np.mean(means, axis=0)
std_estimates = np.std(means, axis=0)

plt.errorbar(points, mean_estimates, yerr=std_estimates, fmt='k-o', capsize=5, label='estimated')

mean_score = np.mean(math_score)
std_score = np.std(math_score)
plt.plot(points, mean_score + std_score/np.sqrt(points), 'r', label='calculated')
plt.plot(points, mean_score - std_score/np.sqrt(points), 'r')

for i, n in enumerate(points):
    plt.plot(np.full(trials, n), means[:, i], 'o', markersize=3, alpha=0.5)

plt.xlabel("number of samples averaged")
plt.ylabel("mean estimates of Math score")
plt.title('Averaging')
plt.legend()
plt.show()

df.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')

# Multidimensional distributions
np.random.seed(10)
np.set_printoptions(precision=2)
np.set_printoptions(suppress=True)
npts = 100
# variance samples

rating_map = df['parental level of education'].value_counts()
df["math score"].map(rating_map)
df["reading score"].map(rating_map)
df["writing score"].map(rating_map)
varsamples = df[['math score', 'reading score']].values
#  mean and variance
varmean = np.mean(varsamples,axis=0)
varstd = np.sqrt(np.var(varsamples,axis=0))
varplotx = [varmean[0]-varstd[0],varmean[0]+varstd[0],None,varmean[0],varmean[0]]
varploty = [varmean[1],varmean[1],None,varmean[1]-varstd[1],varmean[1]+varstd[1]]

# covariance samples
mean = [3,5]
covariance = [[2, 5],[5,2]]
covarsamples = np.random.multivariate_normal(mean,covariance,npts)

# find mean, covariance, eigenvalues, and eigenvectors
covarmean = np.mean(covarsamples,axis=0)
covar = np.cov(covarsamples,rowvar=False)
evalu,evect = np.linalg.eig(covar)
dx0 = evect[0,0]*np.sqrt(evalu[0])
dx1 = evect[1,0]*np.sqrt(evalu[1])
dy0 = evect[0,1]*np.sqrt(evalu[0])
dy1 = evect[1,1]*np.sqrt(evalu[1])
covarplotx = [covarmean[0]-dx0,covarmean[0]+dx0,None,covarmean[0]-dx1,covarmean[0]+dx1]
covarploty = [covarmean[1]+dy0,covarmean[1]-dy0,None,covarmean[1]+dy1,covarmean[1]-dy1]

# plot and print
print("covariance matrix:")
print(covar)
samples = np.vstack((varsamples,covarsamples))
plt.figure()
plt.hist2d(samples[:,0],samples[:,1],bins=30,cmap='binary')
plt.plot(samples[:,0],samples[:,1],'o',markersize=1.5,alpha=0.3)
plt.plot(varmean[0],varmean[1],'ro')
plt.plot(covarmean[0],covarmean[1],'ro')
plt.plot(varplotx,varploty,'r')
plt.plot(covarplotx,covarploty,'r')
plt.text(2.5,-4,"variance",fontsize=15)
plt.text(-1.25,8.5,"covariance",fontsize=15)
plt.axis('off')
plt.show()
# print covariance matrices
print("covariance matrix:")
varcovar = np.cov(varsamples,rowvar=False)
print(varcovar)

covariance matrix:
[[4.08 1.84]
 [1.84 5.4 ]]

/tmp/ipykernel_8405/1172678359.py:22: RuntimeWarning: covariance is not symmetric positive-semidefinite.
  covarsamples = np.random.multivariate_normal(mean,covariance,npts)

covariance matrix:
[[229.92 181.  ]
 [181.   213.17]]

nbins = 250
xmin = np.min(df['reading score'])
xmax = np.max(df['reading score'])

x = np.linspace(xmin, xmax, nbins)
print(f"{nbins} bins = {np.log2(nbins):.0f} bits")

def entropy(dist):
    positives = dist[np.where(dist > 0)] # 0 log(0) = 0
    return -np.sum(positives * np.log2(positives))

uniform = np.ones(nbins) / nbins
mean = np.mean(df['reading score'])
std = df['reading score'].std()
normal = np.exp(-(-x - mean)**2 / (2 * std**2))
normal = normal / np.sum(normal)
onehot = np.zeros(nbins)
onehot[nbins//2] = 1

#Plotting 
fig, axs = plt.subplots(3, 1)
fig.canvas.header_visible = False
# width calculation might need adjustment depending on your exact data range
width = 1.5 * (xmax - xmin) / nbins 

axs[0].bar(x, uniform, width=width)
axs[0].set_title(f"uniform entropy: {entropy(uniform):.1f} bits")
axs[1].bar(x, normal, width=width)
axs[1].set_title(f"Gaussian entropy: {entropy(normal):.1f} bits")
axs[2].bar(x, onehot, width=width)
axs[2].set_title(f"one-hot entropy: {entropy(onehot):.1f} bits")
plt.tight_layout()
plt.show()

250 bins = 8 bits

from sklearn.cluster import KMeans
from scipy.spatial import Voronoi, voronoi_plot_2d

#Load  data
# We generate data that naturally forms clusters for better visualization
np.random.seed(42)
cluster_1 = np.random.normal(loc=[60, 60], scale=8, size=(50, 2))
cluster_2 = np.random.normal(loc=[80, 80], scale=5, size=(50, 2))
cluster_3 = np.random.normal(loc=[40, 80], scale=6, size=(50, 2))

X = np.concatenate([cluster_1, cluster_2, cluster_3])
df = pd.DataFrame(X, columns=['reading score', 'writing score'])

# Apply K-Means Clustering

K = 3 # We choose 3 clusters
kmeans = KMeans(n_clusters=K, random_state=42, n_init=10)
df['cluster_labels'] = kmeans.fit_predict(X)
centers = kmeans.cluster_centers_

print(f"Cluster Centers:\n{centers}")

# Generate and Plot Voronoi Tessellation 
plt.figure(figsize=(10, 8))
# Calculate the Voronoi diagram based on the final K-Means cluster centers
vor = Voronoi(centers)
# Plot the Voronoi diagram boundaries (the "tessellation" or decision boundaries)
voronoi_plot_2d(vor, ax=plt.gca(), show_points=False, show_vertices=False, linestyle='--', color='gray', linewidth=2)

# Plot the original data points, colored by their assigned cluster
plt.scatter(df['reading score'], df['writing score'], 
            c=df['cluster_labels'], cmap='viridis', 
            s=50, alpha=0.8, edgecolor='k', label='Data Points')

# Plot the cluster centers (anchors) as large markers
plt.scatter(centers[:, 0], centers[:, 1], 
            marker='X', s=200, color='red', 
            edgecolors='black', linewidths=2, label='Cluster Anchors')

plt.title(f"K-Means Clustering (K={K}) with Voronoi Tessellation Boundaries")
plt.xlabel("Reading Score")
plt.ylabel("Writing Score")
plt.legend()
plt.xlim(X[:, 0].min()-2, X[:, 0].max()+2)
plt.ylim(X[:, 1].min()-2, X[:, 1].max()+2)


plt.savefig("fig5.png", dpi=300, bbox_inches='tight', transparent=True)
plt.show()

Cluster Centers:
[[79.52273561 80.70031026]
 [40.52452521 80.09126579]
 [59.16375026 59.3376695 ]]

from sklearn.mixture import GaussianMixture

# Load or Generate Example Data 

data = df['math score'].dropna()

# Format Data for Scikit-learn 
# Reshape the single column into a 2D array:
X = data.values.reshape(-1, 1)

#  Apply Expectation-Maximization (GaussianMixture)
K = 3 
gmm = GaussianMixture(n_components=K, random_state=42, init_params='kmeans')

# Fit the model to the data
gmm.fit(X)

# Predict the cluster assignments (hard labels) and responsibilities
df['gmm_labels'] = gmm.predict(X)
probabilities = gmm.predict_proba(X) # The 'latent variables' in the E-step

print(f"Fitted Means (μ): \n{gmm.means_.flatten()}")
print(f"Fitted Standard Deviations (σ): \n{np.sqrt(gmm.covariances_).flatten()}")
print(f"Cluster Weights/Priors: \n{gmm.weights_.flatten()}")
print("\nData points per most likely cluster:")
print(df['gmm_labels'].value_counts().sort_index())

# isualize the Results 
plt.figure(figsize=(10, 6))
# Plot the histogram
plt.hist(data, bins=30, density=True, alpha=0.5, color='gray', label='Actual Data Histogram')

# Plot the individual fitted Gaussian components and the total mixture density
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
logprob = gmm.score_samples(x.reshape(-1, 1))
pdf = np.exp(logprob)

plt.plot(x, pdf, '-k', linewidth=2, label='Total GMM Density')

# Plot each component
for i in range(K):
    # Calculate the PDF for component 'i', weighted by its probability (gmm.weights_[i])
    component_pdf = gmm.weights_[i] * (1 / (np.sqrt(2 * np.pi * gmm.covariances_[i][0]) * gmm.weights_[i])) * np.exp(-(x - gmm.means_[i][0])**2 / (2 * gmm.covariances_[i][0]))
    plt.plot(x, component_pdf, '--', label=f'Component {i+1}')

plt.title(f"E-M Clustering using Gaussian Mixture Model (K={K})")
plt.xlabel("Math Score")
plt.ylabel("Density")
plt.legend()
plt.show()

Fitted Means (μ): 
[70.40285703 89.55344804 49.20293563]
Fitted Standard Deviations (σ): 
[3.80376891 5.31812677 4.89941658]
Cluster Weights/Priors: 
[0.31603455 0.33916601 0.34479944]

Data points per most likely cluster:
gmm_labels
0    48
1    50
2    52
Name: count, dtype: int64

	gender	race/ethnicity	parental level of education	lunch	test preparation course	math score	reading score	writing score
0	female	group B	bachelor's degree	standard	none	72	72	74
1	female	group C	some college	standard	completed	69	90	88

Week03 :Data Science: Probability¶