Density Estimation¶
Dataset¶
Choosing Dataset¶
First, I explore which of my datasets is appropriate to use as an example for fitting a probability distribution.
I prompted Gemini: is it possible to use the diamonds size dataset as an example to fit a probability distribution?
Yes, it is absolutely possible, and in fact, the diamonds dataset is an excellent, classic candidate for practicing probability distribution fitting.
Why it is a good candidate
Continuous Data: The carat variable is continuous and non-negative, which allows you to test distributions like Gamma, Log-Normal, or Weibull, rather than just the standard Normal distribution.
Skewness: Diamond sizes are not normally distributed. There are many small diamonds and very few large ones. This "long tail" (right-skewed) structure forces you to look beyond the Bell Curve.
Real-World Noise: Real data is rarely perfect. This dataset contains "peaks" that make fitting interesting (see "The Challenge" below).
I learned that there is a challenge: "Magic Numbers"
While you can fit a distribution, you will likely notice that the fit isn't perfect. This is due to a phenomenon in the diamond industry known as "bunching" or "magic numbers."
Diamond cutters try to keep diamonds just above specific weights (like 0.5, 1.0, 1.5 carats) because the price jumps significantly at those thresholds.
Result: You will see huge spikes in the histogram at 1.00 and 1.50, and dips at 0.99 or 1.49.
Math Implication: A smooth theoretical distribution (like a Gamma curve) will follow the general trend but will not capture these artificial spikes.
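Before fitting anything, a quick count of stones just below and exactly at the thresholds can confirm the bunching claim. This is a minimal check, assuming the seaborn diamonds dataset that is loaded in the next cell.
import seaborn as sns
carat = sns.load_dataset('diamonds')['carat']
# Compare how many stones sit just below vs. exactly at the "magic" weights
for just_below, magic in [(0.99, 1.00), (1.49, 1.50)]:
    print(f"{just_below} ct: {(carat == just_below).sum()} stones | "
          f"{magic} ct: {(carat == magic).sum()} stones")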
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
# 1. Load the diamonds dataset and preview the first rows
df = sns.load_dataset('diamonds')
data = df['carat']
df.head()
| | carat | cut | color | clarity | depth | table | price | x | y | z |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 |
| 1 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 |
| 2 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 |
| 3 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 |
| 4 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 |
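A quick numeric check of the right skew described above; this assumes the data Series defined in the cell above.
# Summary statistics and skewness of the carat variable
print(data.describe())
print(f"Skewness: {data.skew():.2f}")  # positive value => right-skewed (long tail of large stones)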
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
# This code fits a Gamma distribution to the diamond carat sizes
# 1. Load Dataset
df = sns.load_dataset('diamonds')
data = df['carat']
# 2. Visualize the Histogram
sns.histplot(data, stat="density", bins=50, color='skyblue', label='Actual Data')
# 3. Fit a Distribution (e.g., Log-Normal or Gamma)
# We fit a Gamma distribution here as an example
shape, loc, scale = stats.gamma.fit(data)
# 4. Plot the PDF (Probability Density Function)
x = np.linspace(0, data.max(), 100)
pdf_fitted = stats.gamma.pdf(x, shape, loc, scale)
plt.plot(x, pdf_fitted, 'r-', lw=3, label='Fitted Gamma Distribution')
plt.legend()
plt.show()
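Since Log-Normal is mentioned above as an alternative, a simple way to compare candidates is the total log-likelihood under the fitted parameters. This is a rough sketch reusing data, stats, and np from the cell above; it ignores the carat bunching, so neither fit will be perfect.
# Fit both candidate distributions and compare their log-likelihood on the carat data
gamma_params = stats.gamma.fit(data)
lognorm_params = stats.lognorm.fit(data)
ll_gamma = np.sum(stats.gamma.logpdf(data, *gamma_params))
ll_lognorm = np.sum(stats.lognorm.logpdf(data, *lognorm_params))
print(f"Gamma log-likelihood:      {ll_gamma:.1f}")
print(f"Log-Normal log-likelihood: {ll_lognorm:.1f}")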
Clustering¶
Prompt: Can you make a clustering analysis of the diamond dataset¶
Performing a clustering analysis on the diamonds dataset is a great exercise. It helps uncover natural groupings, such as distinguishing "investment grade" diamonds from "commercial grade" stones based on a combination of their characteristics (Price, Carat, Cut, Color, Clarity).
Since the dataset contains both numerical (Carat, Price, Depth) and categorical (Cut, Color, Clarity) variables, we need to preprocess the data carefully before feeding it into a clustering algorithm like K-Means.
1. Preprocessing Strategy¶
Clustering algorithms calculate distances between data points. To get meaningful results, we must:
Encode Categorical Data: Convert 'Cut', 'Color', and 'Clarity' into numbers. Since these features have a clear order (e.g., Premium > Good), we use Ordinal Encoding rather than One-Hot Encoding to preserve that hierarchy (see the sketch after this list).
Scale the Data: K-Means is sensitive to scale. Price (0–18,000) will totally dominate Carat (0–5) if we don't normalize them. We will use StandardScaler.
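One caveat: sklearn's OrdinalEncoder sorts string categories alphabetically by default, which does not match the quality order. To make the encoded numbers follow the actual grade hierarchy, you can pass explicit category orders, as in this sketch (the grade lists below are the standard ones used in the diamonds dataset, from lowest to highest).
from sklearn.preprocessing import OrdinalEncoder
import seaborn as sns
df = sns.load_dataset('diamonds')
# Grade orders from lowest to highest quality
cut_order = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_order = ['J', 'I', 'H', 'G', 'F', 'E', 'D']                        # J = worst, D = best
clarity_order = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
ordered_encoder = OrdinalEncoder(categories=[cut_order, color_order, clarity_order])
encoded = ordered_encoder.fit_transform(df[['cut', 'color', 'clarity']].astype(str))
print(encoded[:5])  # higher number = higher grade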
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.cluster import KMeans
# 1. Load Dataset
df = sns.load_dataset('diamonds')
# 2. Preprocessing
# We select features that define the diamond's value and physical properties
features_num = ['carat', 'depth', 'table', 'price', 'x', 'y', 'z']
features_cat = ['cut', 'color', 'clarity']
# Encode categorical data (Ordinal fits here because the grades have a hierarchy).
# Note: OrdinalEncoder() with default settings sorts string categories alphabetically,
# which does not match the true quality order; pass explicit category orders
# (as sketched above) if you want the encoding to respect the hierarchy.
encoder = OrdinalEncoder()
df_encoded = df.copy()
df_encoded[features_cat] = encoder.fit_transform(df[features_cat])
# Combine all features for clustering
X = df_encoded[features_num + features_cat]
# Scale the data (CRITICAL step for K-Means)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# 3. Determine optimal clusters (Elbow Method)
# Let's test a range of clusters to see where the "elbow" is
inertia = []
range_clusters = range(1, 11)
for k in range_clusters:
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
kmeans.fit(X_scaled)
inertia.append(kmeans.inertia_)
# Plot Elbow Curve
plt.figure(figsize=(8, 4))
plt.plot(range_clusters, inertia, 'bo-')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia (Distortion)')
plt.title('Elbow Method for Optimal k')
plt.show()
# 4. Apply K-Means
# Based on the typical elbow for this dataset, k=3 or k=4 often works well.
# Let's try k=3 to see if we get Small/Cheap, Medium, and Large/Expensive groups.
k = 3
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_scaled)
# Add cluster labels back to original dataframe for interpretation
df['cluster'] = clusters
# 5. Visualize the Results
# Plotting Carat vs Price is the most intuitive way to see the clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='carat', y='price', hue='cluster', palette='viridis', alpha=0.6)
plt.title(f'K-Means Clustering (k={k}) of Diamonds')
plt.show()
# 6. Interpret the Clusters
print(df.groupby('cluster')[['carat', 'price', 'cut']].agg({
'carat': 'mean',
'price': 'mean',
'cut': lambda x: x.mode()[0] # Most common cut in the cluster
}))
# --- NEW SECTION: KERNEL DENSITY VISUALIZATION ---
# Set up the figure
plt.figure(figsize=(15, 6))
# VISUALIZATION 1: 2D KDE (Contour Plot)
# This shows the "shape" and "peak" of each cluster in 2D space
plt.subplot(1, 2, 1)
sns.kdeplot(
data=df,
x='x',
y='y',
hue='cluster',
palette='viridis',
fill=True, # Fills the contours with color
alpha=0.5, # Transparency to see overlaps
levels=10, # Number of contour lines
thresh=0.05 # Filter out lowest density areas for cleaner look
)
plt.title('2D Density of Clusters (Length vs Width)')
plt.xlabel('Length (mm)')
plt.ylabel('Width (mm)')
# VISUALIZATION 2: 1D KDE (Distribution Curves)
# This shows how well separated the clusters are by size
plt.subplot(1, 2, 2)
sns.kdeplot(
data=df,
x='x',
hue='cluster',
palette='viridis',
fill=True,
common_norm=False, # Calculate density independently for each cluster
alpha=0.4
)
plt.title('1D Density Distribution by Length (x)')
plt.xlabel('Length (mm)')
plt.tight_layout()
plt.show()
carat price cut
cluster
0 1.705259 12003.031215 Premium
1 0.404009 1080.434068 Ideal
2 0.952164 4550.000554 Ideal
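As a second opinion on the choice of k besides the elbow curve, the silhouette score can be computed. This sketch reuses X_scaled and KMeans from the block above and scores on a 5,000-row sample, because the full pairwise computation over ~54,000 rows is slow.
from sklearn.metrics import silhouette_score
import numpy as np
rng = np.random.default_rng(42)
idx = rng.choice(len(X_scaled), size=5000, replace=False)
for k_try in (2, 3, 4, 5):
    # Cluster the full scaled data, then score the sampled subset
    labels_k = KMeans(n_clusters=k_try, random_state=42, n_init=10).fit_predict(X_scaled)
    print(f"k={k_try}: silhouette = {silhouette_score(X_scaled[idx], labels_k[idx]):.3f}")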
Clustering in 3D by size¶
Gemini Prompt: Can you make the code to 3-d cluster the diamond dataset by x, y, z size
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# 1. Load Dataset
df = sns.load_dataset('diamonds')
# Filter out invalid data (diamonds with 0 dimensions)
df = df[(df[['x', 'y', 'z']] > 0).all(axis=1)]
# 2. Preprocessing
features = ['x', 'y', 'z']
X = df[features]
# Standardize the data (Mean=0, Variance=1)
# This is crucial for K-Means so that all dimensions have equal weight
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# 3. Apply K-Means Clustering
# We use k=3 to group them into Small, Medium, and Large sizes
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X_scaled)
# 4. Create 3D Visualization
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
# Scatter plot
# alpha=0.3 helps visualize the density in the cloud
scatter = ax.scatter(df['x'], df['y'], df['z'],
c=df['cluster'], cmap='viridis', s=5, alpha=0.3)
# Labeling
ax.set_xlabel('Length (x)')
ax.set_ylabel('Width (y)')
ax.set_zlabel('Depth (z)')
ax.set_title('3D Clustering of Diamonds by Size')
# Add legend
legend = ax.legend(*scatter.legend_elements(), title="Cluster")
ax.add_artist(legend)
# Adjust viewing angle (optional)
ax.view_init(elev=20, azim=45)
plt.show()
# 5. Interpret the Clusters
print("Cluster Centers (Physical Dimensions):")
print(df.groupby('cluster')[['x', 'y', 'z']].mean())
Cluster Centers (Physical Dimensions):
x y z
cluster
0 6.241590 6.241061 3.857639
1 4.700884 4.709485 2.901731
2 7.586614 7.583424 4.683883
There are several problems with this code:¶
- In the plot, all Y (width) values look like 0, which makes all points appear to sit on the Z axis.
- The z-axis is not scaled, so along the z-axis the points look like a flat line, not like the expected plot.
- The z-axis label is not showing.

So I asked Gemini to review the code and fix these issues. It turns out there is a problem with outliers that distort the results.
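To see what those outliers actually are, you can inspect the suspect rows before filtering; a quick look on a freshly loaded copy of the dataset:
import seaborn as sns
raw = sns.load_dataset('diamonds')
# Rows with a physically impossible zero dimension
zero_dims = raw[(raw[['x', 'y', 'z']] == 0).any(axis=1)]
# Rows with absurdly large y or z (almost certainly data-entry errors)
extreme = raw[(raw['y'] > 30) | (raw['z'] > 30)]
print(f"Rows with a zero dimension: {len(zero_dims)}")
print(f"Rows with y or z > 30 mm:  {len(extreme)}")
print(extreme[['carat', 'x', 'y', 'z', 'price']])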
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# 1. Load Dataset
df = sns.load_dataset('diamonds')
# --- FIX 1: Remove "Impossible" Outliers ---
# The dataset contains errors where x, y, or z are 0, or absurdly large (e.g., y > 30).
# These outliers squash the plot, making normal points look like a flat line at 0.
df = df[(df[['x', 'y', 'z']] > 0).all(axis=1)] # Remove zeros
df = df[(df['y'] < 30) & (df['z'] < 30)] # Remove massive outlier errors
# 2. Preprocessing
features = ['x', 'y', 'z']
X = df[features]
# Standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# 3. Apply K-Means
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X_scaled)
# 4. Create 3D Visualization
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
# Scatter plot
scatter = ax.scatter(df['x'], df['y'], df['z'],
c=df['cluster'], cmap='viridis', s=5, alpha=0.3)
# --- FIX 2 & 3: Fix Label Visibility and "Flat" Look ---
ax.set_xlabel('Length (x)')
ax.set_ylabel('Width (y)')
ax.set_zlabel('Depth (z)', labelpad=10) # labelpad pushes the label out so it's visible
ax.set_title('3D Clustering of Diamonds by Size (Outliers Removed)')
# Set the aspect ratio of the 3D box to be cubic (1,1,1)
# This prevents the z-axis from looking "flat" compared to x and y
ax.set_box_aspect((1, 1, 1))
# Legend
legend = ax.legend(*scatter.legend_elements(), title="Cluster")
ax.add_artist(legend)
# Adjust viewing angle for a better perspective
ax.view_init(elev=30, azim=120)
plt.show()
# 5. Interpret
print("Cluster Centers (Physical Dimensions):")
print(df.groupby('cluster')[['x', 'y', 'z']].mean())
Cluster Centers (Physical Dimensions):
x y z
cluster
0 6.240336 6.239812 3.856876
1 4.700708 4.709296 2.901609
2 7.584314 7.571264 4.678191
# Using Neil's code
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
#
# 1. Load and Prepare Data
#
# We use the seaborn 'diamonds' dataset and select x, y, z
diamonds = sns.load_dataset('diamonds')
# Sampling 3000 points to keep execution fast (a similar scale to the original npts)
data = diamonds[['x', 'y', 'z']].sample(n=3000, random_state=0)
# Filter out zero values (diamonds can't have 0 dimensions)
data = data[(data[['x', 'y', 'z']] > 0).all(axis=1)]
x = data['x'].values
y = data['y'].values
z = data['z'].values
#
# 2. GMM Parameters
#
npts = len(x)
nclusters = 3 # Trying to find 3 groups of diamond sizes
nsteps = 30
np.random.seed(42)
#
# 3. Choose starting points and initialize
#
indices = np.random.uniform(low=0, high=npts, size=nclusters).astype(int)
# Initial centers (means)
mux = x[indices]
muy = y[indices]
muz = z[indices]
# Initial variances (spread) - initially very large
varx = (np.max(x)-np.min(x))**2
vary = (np.max(y)-np.min(y))**2
varz = (np.max(z)-np.min(z))**2
# Initial weights (priors) - uniform 1/3
pc = np.ones(nclusters)/nclusters
#
# 4. Plot before iteration (3D)
#
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, s=1, alpha=0.3, label='Data')
ax.scatter(mux, muy, muz, c='red', s=200, label='Initial Centers')
ax.set_title('Before Iteration (Initial Random Guess)')
ax.set_xlabel('X (Length)')
ax.set_ylabel('Y (Width)')
ax.set_zlabel('Z (Depth)')
plt.legend()
plt.show()
#
# 5. Do E-M Iterations
#
for i in range(nsteps):
# --- Construct Matrices (Broadcasting) ---
# Data matrices (repeated for each cluster)
xm = np.outer(x, np.ones(nclusters))
ym = np.outer(y, np.ones(nclusters))
zm = np.outer(z, np.ones(nclusters))
# Mean matrices (repeated for each data point)
muxm = np.outer(np.ones(npts), mux)
muym = np.outer(np.ones(npts), muy)
muzm = np.outer(np.ones(npts), muz)
# Variance matrices (repeated for each data point)
# Note: Corrected the bug here. Each dimension uses its own variance.
varxm = np.outer(np.ones(npts), varx)
varym = np.outer(np.ones(npts), vary)
varzm = np.outer(np.ones(npts), varz)
# --- E-Step: Use model to update probabilities ---
# Calculate Gaussian probability for X, Y, and Z independently
# P(x|c)
px = (1/np.sqrt(2*np.pi*varxm)) * np.exp(-(xm-muxm)**2/(2*varxm))
# P(y|c)
py = (1/np.sqrt(2*np.pi*varym)) * np.exp(-(ym-muym)**2/(2*varym))
# P(z|c)
pz = (1/np.sqrt(2*np.pi*varzm)) * np.exp(-(zm-muzm)**2/(2*varzm))
# Joint probability P(x,y,z|c) = P(x)*P(y)*P(z)
pvgc = px * py * pz
# Multiply by prior P(c)
pvc = pvgc * np.outer(np.ones(npts), pc)
# Normalize to get posterior probability P(c|x,y,z)
# Adding 1e-10 to avoid division by zero
pcgv = pvc / (np.outer(np.sum(pvc, 1), np.ones(nclusters)) + 1e-10)
# --- M-Step: Use probabilities to update model ---
# 1. Update Weights (pc)
pc = np.sum(pcgv, 0) / npts
# 2. Update Means (mux, muy, muz) - Weighted Average
mux = np.sum(xm * pcgv, 0) / (npts * pc)
muy = np.sum(ym * pcgv, 0) / (npts * pc)
muz = np.sum(zm * pcgv, 0) / (npts * pc)
# 3. Update Variances (varx, vary, varz) - Weighted Spread
# Adding small epsilon (1e-6) to prevent variance from hitting 0
varx = 1e-6 + np.sum((xm - muxm)**2 * pcgv, 0) / (npts * pc)
vary = 1e-6 + np.sum((ym - muym)**2 * pcgv, 0) / (npts * pc)
varz = 1e-6 + np.sum((zm - muzm)**2 * pcgv, 0) / (npts * pc)
#
# 6. Plot after iteration
#
# Assign each point to the cluster with the highest probability
labels = np.argmax(pcgv, axis=1)
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111, projection='3d')
# Plot scatter colored by the cluster label
scatter = ax.scatter(x, y, z, c=labels, cmap='viridis', s=5, alpha=0.6)
# Plot the final centers
ax.scatter(mux, muy, muz, c='red', s=300, marker='*', label='Final Centers')
ax.set_title(f'After {nsteps} Iterations (Clustered by Size)')
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')
plt.legend()
plt.show()
# Print final parameters for inspection
print("Final Cluster Means (Centroids):")
for i in range(nclusters):
print(f"Cluster {i}: X={mux[i]:.2f}, Y={muy[i]:.2f}, Z={muz[i]:.2f}")
Final Cluster Means (Centroids):
Cluster 0: X=6.24, Y=6.24, Z=3.86
Cluster 1: X=4.71, Y=4.72, Z=2.90
Cluster 2: X=7.52, Y=7.50, Z=4.63
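As a cross-check on the hand-rolled E-M loop, sklearn's GaussianMixture with diagonal covariances (which matches the independent per-dimension variances used above) should land on similar centroids; a minimal sketch using the same sample:
import numpy as np
import seaborn as sns
from sklearn.mixture import GaussianMixture
# Same sampling and filtering as the E-M code above
sample = sns.load_dataset('diamonds')[['x', 'y', 'z']].sample(n=3000, random_state=0)
sample = sample[(sample > 0).all(axis=1)]
# 'diag' covariance mirrors the separate varx/vary/varz used in the manual loop
gmm = GaussianMixture(n_components=3, covariance_type='diag', random_state=42)
gmm.fit(sample.values)
print("sklearn GaussianMixture means (x, y, z):")
print(np.round(gmm.means_, 2))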
Learning Points¶
- Scale the Data: K-Means is sensitive to scale. Price (0–18,000) will totally dominate Carat (0–5) if we don't normalize them. For scaling I used StandardScaler.
- Magic Numbers: These play a role in clustering as well.
- Impossible Outliers: It is important to remove rows where the dataset contains errors, i.e., where x, y, or z are 0 or absurdly large (e.g., y > 30). These outliers squash the plot, making normal points look like a flat line at 0.