Week03 :Data Science: Probability¶
In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [11]:
df = pd.read_csv ('~/work/sonam-dendup/datasets/StudentsPerformance.csv')
df.head(2)
Out[11]:
| gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score | |
|---|---|---|---|---|---|---|---|---|
| 0 | female | group B | bachelor's degree | standard | none | 72 | 72 | 74 |
| 1 | female | group C | some college | standard | completed | 69 | 90 | 88 |
In [12]:
df.columns
Out[12]:
Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
'test preparation course', 'math score', 'reading score',
'writing score'],
dtype='object')
In [13]:
df['parental level of education'].value_counts()
Out[13]:
parental level of education some college 226 associate's degree 222 high school 196 some high school 179 bachelor's degree 118 master's degree 59 Name: count, dtype: int64
In [14]:
rating_map =df['parental level of education'].value_counts()
df["math score"].map(rating_map)
math_score = df ['math score']
npts = len(math_score)
mean = np.mean(math_score)
stddev = np.std(math_score)
#Plot
plt.hist(math_score, bins=npts//50, density=True, alpha=0.6, color='gray', edgecolor='black')
plt.plot(math_score, 0*math_score, '|', ms=20, color='black')
xi = np.linspace(mean-3*stddev, mean+3*stddev, 100)
yi = np.exp(-(xi-mean)**2/(2*stddev**2))/np.sqrt(2*np.pi*stddev**2)
plt.plot(xi, yi, 'b', lw=2, label='Gaussian Fit')
plt.xlabel("Math Score")
plt.ylabel("Density")
plt.title("Density Estimation: Gaussian Curve")
plt.legend()
plt.savefig("fig2.1.png", dpi=300, bbox_inches='tight', transparent=True)
plt.show()
In [15]:
rating_map =df['parental level of education'].value_counts()
df["math score"].map(rating_map)
math_score = df ['math score']
trials = 100
points = np.arange(1, len(math_score)+1) # sample sizes from 1 to number of students
means = np.zeros((trials, len(points)))
for i, n in enumerate(points):
for t in range(trials):
means[t, i] = np.mean(np.random.choice(math_score, size=n, replace=True)) # sampling with replacement
mean_estimates = np.mean(means, axis=0)
std_estimates = np.std(means, axis=0)
plt.errorbar(points, mean_estimates, yerr=std_estimates, fmt='k-o', capsize=5, label='estimated')
mean_score = np.mean(math_score)
std_score = np.std(math_score)
plt.plot(points, mean_score + std_score/np.sqrt(points), 'r', label='calculated')
plt.plot(points, mean_score - std_score/np.sqrt(points), 'r')
for i, n in enumerate(points):
plt.plot(np.full(trials, n), means[:, i], 'o', markersize=3, alpha=0.5)
plt.xlabel("number of samples averaged")
plt.ylabel("mean estimates of Math score")
plt.title('Averaging')
plt.legend()
plt.show()
In [16]:
df.columns
Out[16]:
Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
'test preparation course', 'math score', 'reading score',
'writing score'],
dtype='object')
In [17]:
# Multidimensional distributions
np.random.seed(10)
np.set_printoptions(precision=2)
np.set_printoptions(suppress=True)
npts = 100
# variance samples
rating_map = df['parental level of education'].value_counts()
df["math score"].map(rating_map)
df["reading score"].map(rating_map)
df["writing score"].map(rating_map)
varsamples = df[['math score', 'reading score']].values
# mean and variance
varmean = np.mean(varsamples,axis=0)
varstd = np.sqrt(np.var(varsamples,axis=0))
varplotx = [varmean[0]-varstd[0],varmean[0]+varstd[0],None,varmean[0],varmean[0]]
varploty = [varmean[1],varmean[1],None,varmean[1]-varstd[1],varmean[1]+varstd[1]]
# covariance samples
mean = [3,5]
covariance = [[2, 5],[5,2]]
covarsamples = np.random.multivariate_normal(mean,covariance,npts)
# find mean, covariance, eigenvalues, and eigenvectors
covarmean = np.mean(covarsamples,axis=0)
covar = np.cov(covarsamples,rowvar=False)
evalu,evect = np.linalg.eig(covar)
dx0 = evect[0,0]*np.sqrt(evalu[0])
dx1 = evect[1,0]*np.sqrt(evalu[1])
dy0 = evect[0,1]*np.sqrt(evalu[0])
dy1 = evect[1,1]*np.sqrt(evalu[1])
covarplotx = [covarmean[0]-dx0,covarmean[0]+dx0,None,covarmean[0]-dx1,covarmean[0]+dx1]
covarploty = [covarmean[1]+dy0,covarmean[1]-dy0,None,covarmean[1]+dy1,covarmean[1]-dy1]
# plot and print
print("covariance matrix:")
print(covar)
samples = np.vstack((varsamples,covarsamples))
plt.figure()
plt.hist2d(samples[:,0],samples[:,1],bins=30,cmap='binary')
plt.plot(samples[:,0],samples[:,1],'o',markersize=1.5,alpha=0.3)
plt.plot(varmean[0],varmean[1],'ro')
plt.plot(covarmean[0],covarmean[1],'ro')
plt.plot(varplotx,varploty,'r')
plt.plot(covarplotx,covarploty,'r')
plt.text(2.5,-4,"variance",fontsize=15)
plt.text(-1.25,8.5,"covariance",fontsize=15)
plt.axis('off')
plt.show()
# print covariance matrices
print("covariance matrix:")
varcovar = np.cov(varsamples,rowvar=False)
print(varcovar)
covariance matrix: [[4.08 1.84] [1.84 5.4 ]]
/tmp/ipykernel_8405/1172678359.py:22: RuntimeWarning: covariance is not symmetric positive-semidefinite. covarsamples = np.random.multivariate_normal(mean,covariance,npts)
covariance matrix: [[229.92 181. ] [181. 213.17]]
In [18]:
nbins = 250
xmin = np.min(df['reading score'])
xmax = np.max(df['reading score'])
x = np.linspace(xmin, xmax, nbins)
print(f"{nbins} bins = {np.log2(nbins):.0f} bits")
def entropy(dist):
positives = dist[np.where(dist > 0)] # 0 log(0) = 0
return -np.sum(positives * np.log2(positives))
uniform = np.ones(nbins) / nbins
mean = np.mean(df['reading score'])
std = df['reading score'].std()
normal = np.exp(-(-x - mean)**2 / (2 * std**2))
normal = normal / np.sum(normal)
onehot = np.zeros(nbins)
onehot[nbins//2] = 1
#Plotting
fig, axs = plt.subplots(3, 1)
fig.canvas.header_visible = False
# width calculation might need adjustment depending on your exact data range
width = 1.5 * (xmax - xmin) / nbins
axs[0].bar(x, uniform, width=width)
axs[0].set_title(f"uniform entropy: {entropy(uniform):.1f} bits")
axs[1].bar(x, normal, width=width)
axs[1].set_title(f"Gaussian entropy: {entropy(normal):.1f} bits")
axs[2].bar(x, onehot, width=width)
axs[2].set_title(f"one-hot entropy: {entropy(onehot):.1f} bits")
plt.tight_layout()
plt.show()
250 bins = 8 bits
Data Science: Density Estimation
In [21]:
from sklearn.cluster import KMeans
from scipy.spatial import Voronoi, voronoi_plot_2d
#Load data
# We generate data that naturally forms clusters for better visualization
np.random.seed(42)
cluster_1 = np.random.normal(loc=[60, 60], scale=8, size=(50, 2))
cluster_2 = np.random.normal(loc=[80, 80], scale=5, size=(50, 2))
cluster_3 = np.random.normal(loc=[40, 80], scale=6, size=(50, 2))
X = np.concatenate([cluster_1, cluster_2, cluster_3])
df = pd.DataFrame(X, columns=['reading score', 'writing score'])
# Apply K-Means Clustering
K = 3 # We choose 3 clusters
kmeans = KMeans(n_clusters=K, random_state=42, n_init=10)
df['cluster_labels'] = kmeans.fit_predict(X)
centers = kmeans.cluster_centers_
print(f"Cluster Centers:\n{centers}")
# Generate and Plot Voronoi Tessellation
plt.figure(figsize=(10, 8))
# Calculate the Voronoi diagram based on the final K-Means cluster centers
vor = Voronoi(centers)
# Plot the Voronoi diagram boundaries (the "tessellation" or decision boundaries)
voronoi_plot_2d(vor, ax=plt.gca(), show_points=False, show_vertices=False, linestyle='--', color='gray', linewidth=2)
# Plot the original data points, colored by their assigned cluster
plt.scatter(df['reading score'], df['writing score'],
c=df['cluster_labels'], cmap='viridis',
s=50, alpha=0.8, edgecolor='k', label='Data Points')
# Plot the cluster centers (anchors) as large markers
plt.scatter(centers[:, 0], centers[:, 1],
marker='X', s=200, color='red',
edgecolors='black', linewidths=2, label='Cluster Anchors')
plt.title(f"K-Means Clustering (K={K}) with Voronoi Tessellation Boundaries")
plt.xlabel("Reading Score")
plt.ylabel("Writing Score")
plt.legend()
plt.xlim(X[:, 0].min()-2, X[:, 0].max()+2)
plt.ylim(X[:, 1].min()-2, X[:, 1].max()+2)
plt.savefig("fig5.png", dpi=300, bbox_inches='tight', transparent=True)
plt.show()
Cluster Centers: [[79.52273561 80.70031026] [40.52452521 80.09126579] [59.16375026 59.3376695 ]]
In [44]:
from sklearn.mixture import GaussianMixture
# Load or Generate Example Data
data = df['math score'].dropna()
# Format Data for Scikit-learn
# Reshape the single column into a 2D array:
X = data.values.reshape(-1, 1)
# Apply Expectation-Maximization (GaussianMixture)
K = 3
gmm = GaussianMixture(n_components=K, random_state=42, init_params='kmeans')
# Fit the model to the data
gmm.fit(X)
# Predict the cluster assignments (hard labels) and responsibilities
df['gmm_labels'] = gmm.predict(X)
probabilities = gmm.predict_proba(X) # The 'latent variables' in the E-step
print(f"Fitted Means (μ): \n{gmm.means_.flatten()}")
print(f"Fitted Standard Deviations (σ): \n{np.sqrt(gmm.covariances_).flatten()}")
print(f"Cluster Weights/Priors: \n{gmm.weights_.flatten()}")
print("\nData points per most likely cluster:")
print(df['gmm_labels'].value_counts().sort_index())
# isualize the Results
plt.figure(figsize=(10, 6))
# Plot the histogram
plt.hist(data, bins=30, density=True, alpha=0.5, color='gray', label='Actual Data Histogram')
# Plot the individual fitted Gaussian components and the total mixture density
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
logprob = gmm.score_samples(x.reshape(-1, 1))
pdf = np.exp(logprob)
plt.plot(x, pdf, '-k', linewidth=2, label='Total GMM Density')
# Plot each component
for i in range(K):
# Calculate the PDF for component 'i', weighted by its probability (gmm.weights_[i])
component_pdf = gmm.weights_[i] * (1 / (np.sqrt(2 * np.pi * gmm.covariances_[i][0]) * gmm.weights_[i])) * np.exp(-(x - gmm.means_[i][0])**2 / (2 * gmm.covariances_[i][0]))
plt.plot(x, component_pdf, '--', label=f'Component {i+1}')
plt.title(f"E-M Clustering using Gaussian Mixture Model (K={K})")
plt.xlabel("Math Score")
plt.ylabel("Density")
plt.legend()
plt.show()
Fitted Means (μ): [70.40285703 89.55344804 49.20293563] Fitted Standard Deviations (σ): [3.80376891 5.31812677 4.89941658] Cluster Weights/Priors: [0.31603455 0.33916601 0.34479944] Data points per most likely cluster: gmm_labels 0 48 1 50 2 52 Name: count, dtype: int64
In [ ]: