Aristarco - Fab Futures - Data Science

Transforms¶

Data transformation¶

Data transformation in data science is the crucial process of converting raw, messy data from various sources into a clean, standardized, and usable format, making it ready for analysis, modeling (such as machine learning), reporting, and storage, while ensuring accuracy, consistency, and an optimal structure for insights. It involves steps like cleaning errors, standardizing formats, handling missing values, aggregating data, and changing scales (e.g., normalization, log transforms) to meet specific analytical goals.

Key Aspects of Data Transformation:¶

  • Data Cleaning: Fixing errors, removing duplicates, handling outliers, and correcting inconsistencies.
  • Format Conversion: Changing data from one type to another (e.g., CSV to JSON, text to database tables).
  • Structuring: Combining columns (e.g., first/last name to full name) or restructuring data for different systems.
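
As a quick illustration of the cleaning, conversion, and structuring steps above, here is a minimal pandas sketch on a small made-up customer table (the column names and values are purely illustrative):

import pandas as pd

# Hypothetical raw customer records with a duplicate row, a missing value,
# and numbers stored as text
raw = pd.DataFrame({
    "first_name": ["Ana", "Ana", "Luis", "Eva"],
    "last_name":  ["Diaz", "Diaz", "Perez", "Gomez"],
    "amount":     ["100", "100", "250", None],
})

clean = (
    raw.drop_duplicates()                                                   # cleaning: remove exact duplicates
       .assign(amount=lambda d: pd.to_numeric(d["amount"]))                 # format conversion: text -> numeric
       .assign(amount=lambda d: d["amount"].fillna(d["amount"].median()))   # handle missing values
       .assign(full_name=lambda d: d["first_name"] + " " + d["last_name"])  # structuring: combine columns
)

print(clean)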

Value Transformation:

  • Normalization/Standardization: Scaling numerical data to a specific range (e.g., 0-1) or mean/std dev.
  • Encoding: Converting categorical data (like "Male"/"Female") into numerical values (0/1) for algorithms.
  • Aggregation: Summarizing data (e.g., summing sales by month).
  • Feature Engineering: Creating new, more informative features from existing data, essential for ML.
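
A short sketch of these value transformations, again on a made-up sales table (column names are illustrative); it uses scikit-learn's MinMaxScaler for the 0-1 scaling and plain pandas for the encoding and aggregation:

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Made-up sales records
sales = pd.DataFrame({
    "month":   ["Jan", "Jan", "Feb", "Feb"],
    "channel": ["online", "store", "online", "store"],
    "revenue": [120.0, 300.0, 90.0, 410.0],
})

# Normalization: rescale revenue to the 0-1 range
sales["revenue_scaled"] = MinMaxScaler().fit_transform(sales[["revenue"]]).ravel()

# Encoding: turn the categorical 'channel' column into 0/1 indicator columns
encoded = pd.get_dummies(sales["channel"], prefix="channel")

# Aggregation: total revenue per month (also a simple engineered feature)
monthly = sales.groupby("month", sort=False)["revenue"].sum()

print(sales, encoded, monthly, sep="\n\n")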

Why It's Important:

  • Improves Accuracy: Ensures reliable analysis and decisions.
  • Enhances Usability: Makes data accessible and understandable for different tools and users.
  • Supports Algorithms: Many ML models require numerical, clean, and properly scaled input.
  • Data Integration: Unifies data from disparate sources into a consistent format.

In essence, transformation turns chaotic raw data into high-quality, actionable assets, bridging the gap between messy real-world data and valuable business insights.

Principal Component Analysis¶

PCA, or Principal Component Analysis, is a data science technique for dimensionality reduction that transforms high-dimensional data into a smaller set of uncorrelated variables called principal components. This simplifies datasets by compressing most of the original information into fewer features, which can improve model performance, reduce noise, and make complex data easier to visualize and analyze.

How PCA is used in data science

Dimensionality Reduction: PCA reduces the number of features in a dataset while retaining the most important information, which can significantly speed up the training time for machine learning models.

Noise Reduction: By focusing on components with the highest variance, PCA can filter out random noise in the data, leading to more robust models.

Data Visualization: PCA can project high-dimensional data onto a two or three-dimensional space, making it possible to visualize complex relationships, patterns, and clusters that would otherwise be invisible.

Feature Engineering: It creates a new, smaller set of features (the principal components) that can be used as input for other machine learning algorithms.

Exploratory Data Analysis: It helps in understanding the structure of the data by identifying the main directions of variance, which can reveal underlying patterns.

Key benefits¶

Improved model performance: By reducing the number of features and filtering noise, PCA can improve the accuracy and efficiency of machine learning models.

Computational efficiency: Processing fewer dimensions leads to faster training and inference times.

Enhanced interpretability: Visualizing data in a lower-dimensional space provides more intuitive insights than working with hundreds of variables.

Overcoming the Curse of Dimensionality: It helps mitigate the negative impacts of having too many features on model performance.

Source: IBM, "What is Principal Component Analysis (PCA)?"

Data standardization¶

Data standardization is the process of converting data from various sources into a consistent, uniform format, ensuring it follows predefined rules for structure, definition, and values, and making it comparable, accessible, and reliable for analysis, integration, and use in applications like machine learning. It addresses inconsistencies (e.g., "St." vs. "Street") and ensures features with different scales don't unfairly influence models.

Key Aspects of Data Standardization

  • Consistency: Creates uniform formats, labels, and definitions across datasets.
  • Integration: Enables data from different systems to work together seamlessly.
  • Analysis: Improves data quality, accuracy, and reliability for reporting and insights.
  • Machine Learning: Often involves transforming data so it has a mean of 0 and standard deviation of 1 (Z-score) to prevent high-value features from dominating, especially for algorithms like SVM, KNN, or PCA.

How it Works (Examples)

  • Address Data: Changing "Main St." to "Main Street" or "Maint St" to "Main Street".
  • Date Formats: Converting various date entries (e.g., 1/1/2024, Jan 1, 2024) into a single format like YYYY-MM-DD.
  • Categorical Data: Mapping "USA," "U.S.," and "United States" to a single "United States".
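
A minimal pandas sketch of these mapping rules; the replacement dictionaries and the typo fix are assumptions chosen to mirror the examples above, not a general-purpose cleaner:

import pandas as pd

# Hypothetical messy records mirroring the examples above
records = pd.DataFrame({
    "address": ["123 Main St.", "123 Maint St", "45 Oak Street"],
    "date":    ["1/1/2024", "Jan 1, 2024", "2024-01-01"],
    "country": ["USA", "U.S.", "United States"],
})

# Address data: fix the known typo first, then expand the abbreviation
records["address"] = (records["address"]
                      .str.replace("Maint St", "Main Street", regex=False)
                      .str.replace("St.", "Street", regex=False))

# Date formats: parse each entry and re-emit it as YYYY-MM-DD
records["date"] = [pd.to_datetime(d).strftime("%Y-%m-%d") for d in records["date"]]

# Categorical data: map every spelling variant to one canonical label
records["country"] = records["country"].replace({"USA": "United States",
                                                 "U.S.": "United States"})

print(records)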

Why It's Important

  • Better Decisions: High-quality, consistent data leads to more reliable business intelligence.
  • Efficiency: Reduces errors, redundancies, and operational costs.
  • Compliance: Helps meet regulatory requirements by ensuring data accuracy.
  • System Compatibility: Allows different software and platforms to understand and use the data.

Example¶

Gemini Prompt: Act as a data science professor. Select a dataset to learn data standardization and PCA

Hello, and welcome to class! I am delighted to guide you through two fundamental pillars of machine learning preprocessing: Data Standardization and Principal Component Analysis (PCA).

To truly understand these concepts, we need a dataset that is "messy" enough to require standardization, but structured enough to make the results of PCA obvious.

The Selected Dataset: The Wine Dataset

For this lesson, we will use the classic UCI Wine Dataset.

Why this dataset? It contains the results of a chemical analysis of wines grown in the same region in Italy but derived from three different cultivars (varieties).

  • Rows (Samples): 178 individual wines.
  • Columns (Features): 13 continuous chemical attributes (e.g., Alcohol, Malic Acid, Magnesium, Color Intensity).

This is the perfect candidate because the features have vastly different scales.

Phase 1: The Problem of Scale

Imagine we feed this raw data directly into a machine learning model. Let's look at two specific features from the dataset to see the problem:

  • Nonflavanoid Phenols: typical value range 0.13 - 0.66 mg/L (very small)
  • Proline: typical value range 278 - 1680 mg/L (very large)

The Trap: PCA works by finding the directions (Principal Components) where the data varies the most. If we don't fix the scales, PCA will look at Proline (variance in the hundreds) and think it is infinitely more important than Phenols (variance < 1).

The algorithm isn't "smart" enough to know that small changes in Phenols might be chemically significant. It just sees big numbers vs. small numbers.

Phase 2: Data Standardization (Z-Score)

To fix this, we force all features to speak the same language. We apply Standardization (often called Z-score normalization) to every column.

The Formula: z = (x - μ) / σ

Where x is the original value, μ is the mean of that feature, and σ is its standard deviation.

The Result: After this step, every chemical feature (Alcohol, Magnesium, Proline) will have a mean of 0 and a standard deviation of 1. Now, a value of 2.0 in Proline means "two standard deviations above average," and a value of 2.0 in Phenols means the exact same thing relative to its own distribution. They are now comparable.
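
A tiny NumPy sketch of that formula, applied by hand to some arbitrary Proline-like values (the numbers are made up for illustration):

import numpy as np

# Arbitrary "Proline-like" values on a large scale
x = np.array([278.0, 520.0, 760.0, 1050.0, 1680.0])

# Z-score: subtract the feature's mean, divide by its standard deviation
z = (x - x.mean()) / x.std()

print(z.round(2))                             # each value expressed in standard deviations
print(round(z.mean(), 2), round(z.std(), 2))  # effectively 0 and exactly 1

This is the same arithmetic that StandardScaler performs column by column in the code below.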

Phase 3: Principal Component Analysis (PCA)

Now that our data is standardized, we are ready for PCA.

What PCA does: It takes our 13-dimensional problem (which is impossible to visualize) and rotates it to find the "axes" that capture the most information.

  • PC1 (First Principal Component): The direction where the data is most spread out.
  • PC2 (Second Principal Component): The direction orthogonal (perpendicular) to PC1 that captures the second-most variance.

The Outcome on the Wine Data: Typically, just the first 3 Principal Components capture over 65% of the total information contained in the original 13 features. Instead of tracking 13 chemicals, we can plot the wines on a 2D graph using PC1 and PC2.

You will likely see three distinct clusters appear—these correspond to the three specific wine cultivars! This reveals the hidden structure of the data that was previously buried in the spreadsheet.

Summary of the Workflow

  • Load Data: Import the 13 features of the Wine dataset.
  • Standardize: Apply StandardScaler so Proline doesn't dominate.
  • Apply PCA: Compress the 13 dimensions down to 2 or 3 principal components.
  • Visualize: Plot the new components to see the separation of wine varieties.

Next Step

To solidify this knowledge, we should get our hands dirty with code.

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# --- STEP 1: LOAD THE DATA ---
# We load the dataset and convert it to a DataFrame for easier viewing
data = load_wine()
df = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target # These are the wine categories (0, 1, 2)

print("### PROFESSOR'S NOTE: RAW DATA STATISTICS ###")
print("Notice the massive difference in scale between 'proline' and 'nonflavanoid_phenols':")
print(df[['proline', 'nonflavanoid_phenols']].describe().loc[['mean', 'std']])
print("-" * 50)


# --- STEP 2: STANDARDIZATION ---
# This brings all features to Mean=0, Std=1
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)

# Let's verify the standardization worked
df_scaled = pd.DataFrame(X_scaled, columns=df.columns)
print("\n### PROFESSOR'S NOTE: STANDARDIZED DATA ###")
print("Notice how the means are now effectively 0 and std is 1:")
print(df_scaled[['proline', 'nonflavanoid_phenols']].describe().loc[['mean', 'std']].round(2))
print("-" * 50)


# --- STEP 3: APPLY PCA ---
# We compress the 13 columns down to just 2 Principal Components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Calculate how much information (variance) we preserved
explained_variance = pca.explained_variance_ratio_
total_variance = sum(explained_variance) * 100

print(f"\n### PROFESSOR'S NOTE: PCA RESULTS ###")
print(f"PC1 explains: {explained_variance[0]:.2%} of the variance")
print(f"PC2 explains: {explained_variance[1]:.2%} of the variance")
print(f"Together, we preserved {total_variance:.2f}% of the information using only 2 dimensions!")


# --- STEP 4: VISUALIZATION ---
plt.figure(figsize=(10, 8))
sns.scatterplot(
    x=X_pca[:, 0], 
    y=X_pca[:, 1], 
    hue=data.target_names[y], # Color by wine variety
    palette='viridis',
    s=100
)

plt.title(f'PCA of Wine Dataset (Preserved Variance: {total_variance:.2f}%)', fontsize=15)
plt.xlabel(f'Principal Component 1 ({explained_variance[0]:.2%})', fontsize=12)
plt.ylabel(f'Principal Component 2 ({explained_variance[1]:.2%})', fontsize=12)
plt.axhline(0, color='grey', linestyle='--', linewidth=0.8)
plt.axvline(0, color='grey', linestyle='--', linewidth=0.8)
plt.legend(title='Wine Cultivar')
plt.grid(True, alpha=0.3)

plt.show()
### PROFESSOR'S NOTE: RAW DATA STATISTICS ###
Notice the massive difference in scale between 'proline' and 'nonflavanoid_phenols':
         proline  nonflavanoid_phenols
mean  746.893258              0.361854
std   314.907474              0.124453
--------------------------------------------------

### PROFESSOR'S NOTE: STANDARDIZED DATA ###
Notice how the means are now effectively 0 and std is 1:
      proline  nonflavanoid_phenols
mean     -0.0                  -0.0
std       1.0                   1.0
--------------------------------------------------

### PROFESSOR'S NOTE: PCA RESULTS ###
PC1 explains: 36.20% of the variance
PC2 explains: 19.21% of the variance
Together, we preserved 55.41% of the information using only 2 dimensions!
[Figure: PCA scatter plot of the Wine dataset (PC1 vs. PC2), colored by wine cultivar]

What to look for when you run this:

The Console Output: Look closely at the "Raw Data" printout. You will see proline has a mean around 746, while nonflavanoid_phenols has a mean of 0.36. Without standardization, Proline would dominate the calculation completely.

The Plot: You will see three distinct clusters of dots. Each color represents a different wine cultivar.

Notice how well separated they are? This proves that even though we threw away 11 dimensions of data, we kept the essential structure that distinguishes one wine from another.

Prompt: Explain the meaning of the chart, what PC1 and PC2 are, and what the loading scores are and how to interpret them¶

1. The Map: The Scatter Plot

  • The chart you just generated is essentially a similarity map.
  • The Points: Each dot is one specific bottle of wine.
  • The Distance: In this new 2D space, the distance between points represents chemical similarity.

If two dots are right next to each other, those two wines have a nearly identical chemical profile. If they are on opposite sides of the graph, they are chemically very different.

  • The Axes (PC1 and PC2): These are the new "directions" or "rulers" we created to measure the wines. We are no longer measuring them by "Alcohol" or "Ash"; we are measuring them by "Principal Component 1" and "Principal Component 2."

2. The Components: What are PC1 and PC2?

Students often ask: "Is PC1 just the Alcohol column?" No. Think of PC1 and PC2 as Super-Variables or Blends. Imagine you are making a smoothie. The original features (Alcohol, Malic Acid, Ash) are the raw ingredients (Banana, Strawberry, Kale).

PC1 is a specific smoothie recipe: It might be 50% Banana, 30% Strawberry, and -20% Kale.

PC2 is a completely different recipe: It might be 10% Banana, -40% Strawberry, and 80% Kale.

Mathematically, they are Linear Combinations of your original standardized features:

PC1 = (w_1 × Alcohol) + (w_2 × Malic Acid) + ... + (w_13 × Proline)

PC1 is the blend that captures the maximum variance (the widest spread of the data). PC2 is the blend that captures the second-most variance, but it must be uncorrelated (perpendicular) to PC1.
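
A small sketch to verify that claim on the Wine data: it refits the same StandardScaler + PCA pipeline used in the cell above and checks that PC1 really is the weighted sum of the 13 standardized features:

import numpy as np
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Rebuild the standardized Wine data and fit PCA, as in the earlier cell
X_scaled = StandardScaler().fit_transform(load_wine().data)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# PC1 for every wine is the weighted sum of its 13 standardized features,
# using the weights w_1 ... w_13 stored in pca.components_[0]
pc1_by_hand = X_scaled @ pca.components_[0]

print(np.allclose(pc1_by_hand, X_pca[:, 0]))  # True: the "blend" matches PCA's output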

3. The Recipe: Loading Scores

How do we know the "recipe" for these blends? We look at the Loading Scores.

A Loading Score tells you how much a specific original feature contributes to a Principal Component. For standardized data it is proportional to the correlation between that feature and the component (scaling each raw weight in pca.components_ by the square root of the component's variance gives the exact correlation).

How to Interpret the Scores:

High Positive Value (e.g., +0.9): Strong positive influence. When this feature increases, the PC value increases.

Example: If "Flavanoids" has a loading of +0.9 on PC1, then wines with high PC1 values usually have very high Flavanoids.

High Negative Value (e.g., -0.8): Strong negative influence. When this feature increases, the PC value decreases.

Example: If "Alcalinity" has a loading of -0.8 on PC1, then wines with high PC1 values have low Alcalinity.

Near Zero (e.g., 0.05): No influence. This feature is just noise to this specific component.

Let's Reveal the Recipe (The Code)¶

Let's modify our script slightly to print out these "recipes" so we can see exactly what chemically defines our wine clusters.

In [2]:
# Here is the Python script to perform the analysis we discussed. I have annotated the code heavily so you can follow the logic as if I were standing right there at the whiteboard.

# This script uses scikit-learn (the industry standard for machine learning in Python) along with pandas for data handling and matplotlib for visualization.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# --- STEP 1: LOAD THE DATA ---
# We load the dataset and convert it to a DataFrame for easier viewing
data = load_wine()
df = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target # These are the wine categories (0, 1, 2)

print("### PROFESSOR'S NOTE: RAW DATA STATISTICS ###")
print("Notice the massive difference in scale between 'proline' and 'nonflavanoid_phenols':")
print(df[['proline', 'nonflavanoid_phenols']].describe().loc[['mean', 'std']])
print("-" * 50)


# --- STEP 2: STANDARDIZATION ---
# This brings all features to Mean=0, Std=1
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)

# Let's verify the standardization worked
df_scaled = pd.DataFrame(X_scaled, columns=df.columns)
print("\n### PROFESSOR'S NOTE: STANDARDIZED DATA ###")
print("Notice how the means are now effectively 0 and std is 1:")
print(df_scaled[['proline', 'nonflavanoid_phenols']].describe().loc[['mean', 'std']].round(2))
print("-" * 50)


# --- STEP 3: APPLY PCA ---
# We compress the 13 columns down to just 2 Principal Components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Calculate how much information (variance) we preserved
explained_variance = pca.explained_variance_ratio_
total_variance = sum(explained_variance) * 100

print(f"\n### PROFESSOR'S NOTE: PCA RESULTS ###")
print(f"PC1 explains: {explained_variance[0]:.2%} of the variance")
print(f"PC2 explains: {explained_variance[1]:.2%} of the variance")
print(f"Together, we preserved {total_variance:.2f}% of the information using only 2 dimensions!")


# --- STEP 4: VISUALIZATION ---
plt.figure(figsize=(10, 8))
sns.scatterplot(
    x=X_pca[:, 0], 
    y=X_pca[:, 1], 
    hue=data.target_names[y], # Color by wine variety
    palette='viridis',
    s=100
)

plt.title(f'PCA of Wine Dataset (Preserved Variance: {total_variance:.2f}%)', fontsize=15)
plt.xlabel(f'Principal Component 1 ({explained_variance[0]:.2%})', fontsize=12)
plt.ylabel(f'Principal Component 2 ({explained_variance[1]:.2%})', fontsize=12)
plt.axhline(0, color='grey', linestyle='--', linewidth=0.8)
plt.axvline(0, color='grey', linestyle='--', linewidth=0.8)
plt.legend(title='Wine Cultivar')
plt.grid(True, alpha=0.3)

plt.show()

# Create a DataFrame for the "Loadings" (The Recipe)
loadings = pd.DataFrame(
    pca.components_.T,  # Transpose the matrix
    columns=['PC1', 'PC2'], 
    index=df.columns
)

print("### PROFESSOR'S ANALYSIS: The 'Recipe' for PC1 ###")
# We sort by PC1 to see the strongest drivers
print(loadings['PC1'].sort_values(ascending=False))
### PROFESSOR'S NOTE: RAW DATA STATISTICS ###
Notice the massive difference in scale between 'proline' and 'nonflavanoid_phenols':
         proline  nonflavanoid_phenols
mean  746.893258              0.361854
std   314.907474              0.124453
--------------------------------------------------

### PROFESSOR'S NOTE: STANDARDIZED DATA ###
Notice how the means are now effectively 0 and std is 1:
      proline  nonflavanoid_phenols
mean     -0.0                  -0.0
std       1.0                   1.0
--------------------------------------------------

### PROFESSOR'S NOTE: PCA RESULTS ###
PC1 explains: 36.20% of the variance
PC2 explains: 19.21% of the variance
Together, we preserved 55.41% of the information using only 2 dimensions!
[Figure: PCA scatter plot of the Wine dataset (PC1 vs. PC2), colored by wine cultivar]
### PROFESSOR'S ANALYSIS: The 'Recipe' for PC1 ###
flavanoids                      0.422934
total_phenols                   0.394661
od280/od315_of_diluted_wines    0.376167
proanthocyanins                 0.313429
hue                             0.296715
proline                         0.286752
alcohol                         0.144329
magnesium                       0.141992
ash                            -0.002051
color_intensity                -0.088617
alcalinity_of_ash              -0.239320
malic_acid                     -0.245188
nonflavanoid_phenols           -0.298533
Name: PC1, dtype: float64

PC1 is usually dominated by Flavanoids, Phenols, and OD280/OD315. This suggests PC1 separates wines based on their complexity and maturity.

PC2 might be dominated by Color Intensity and Alcohol. This suggests PC2 separates wines based on their boldness or visual appearance.

Summary

When you look at your chart:

Moving Left-to-Right (along PC1) means you are changing the Phenol/Flavanoid content.

Moving Up-and-Down (along PC2) means you are changing the Color intensity.

Next Step¶

We used 2 components because they are easy to plot on a 2D screen. But we started with 13 dimensions. How do we know if 2 is enough? Maybe we need 3? Or 5?

Prompt explain the "Scree Plot" and the "Cumulative Variance" rule to scientifically determine the perfect number of components to keep¶

This is the "Goldilocks Problem" of Data Science: How much is too much, and how little is too little?

We want to reduce dimensions to simplify the data, but we don't want to lose important information. To make this decision scientifically, we use two specific tools.

Tool 1: The Scree Plot (The "Elbow" Method)

Imagine you are squeezing juice from oranges.

  • PC1 (First Squeeze): You get a huge glass of juice.

  • PC2 (Second Squeeze): You get half a glass.

  • PC3 (Third Squeeze): You get a few drops.

  • PC13 (Last Squeeze): You are just breaking the rind; there is nothing left.

A Scree Plot visualizes this "juice" (variance) for each component. We look for the "Elbow"—the point where the curve bends sharply and flattens out.

Before the Elbow: Each new component adds significant information.

After the Elbow: You are essentially just adding noise.

Tool 2: Cumulative Variance (The "Threshold" Method)

This is a strictly numerical rule. You decide in advance how much information you need to keep (e.g., "I need to preserve 90% of the original data's structure").

You then stack the components on top of each other until you hit that number.

PC1: 36%

PC1 + PC2: 55%

PC1 + PC2 + PC3: 66%

...

PC1 ... PC8: ~92% (Stop here! We have crossed the 90% threshold.)

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# 1. Load and Standardize (Same as before)
data = load_wine()
X = data.data
X_scaled = StandardScaler().fit_transform(X)

# 2. Fit PCA with ALL components instead of just 2
pca_all = PCA()
pca_all.fit(X_scaled)

# 3. Extract Variance Data
variance_ratios = pca_all.explained_variance_ratio_
cumulative_variance = np.cumsum(variance_ratios)

# 4. Plotting
plt.figure(figsize=(12, 6))

# Plot 1: The Scree Plot (Individual Variance)
plt.bar(
    range(1, 14), 
    variance_ratios, 
    alpha=0.7, 
    label='Individual Variance (Scree)',
    color='#4c72b0'
)

# Plot 2: Cumulative Variance (Step Plot)
plt.step(
    range(1, 14), 
    cumulative_variance, 
    where='mid', 
    label='Cumulative Variance',
    color='#c44e52',
    linewidth=2
)

# Visual Guide Lines
plt.axhline(y=0.90, color='grey', linestyle='--', label='90% Threshold')
plt.axhline(y=0.80, color='grey', linestyle=':', label='80% Threshold')

plt.title('Scree Plot & Cumulative Variance (Wine Dataset)', fontsize=16)
plt.xlabel('Principal Component Index', fontsize=12)
plt.ylabel('Explained Variance Ratio', fontsize=12)
plt.xticks(range(1, 14))
plt.legend(loc='center right')
plt.grid(True, alpha=0.3)

plt.show()

# Print exact numbers for decision making
print("### PROFESSOR'S ANALYSIS ###")
for i, (var, cum_var) in enumerate(zip(variance_ratios, cumulative_variance)):
    print(f"PC{i+1}: Explains {var:.2%} | Cumulative: {cum_var:.2%}")
[Figure: Scree plot (bars of individual explained variance) with the cumulative variance step line and the 80%/90% threshold lines]
### PROFESSOR'S ANALYSIS ###
PC1: Explains 36.20% | Cumulative: 36.20%
PC2: Explains 19.21% | Cumulative: 55.41%
PC3: Explains 11.12% | Cumulative: 66.53%
PC4: Explains 7.07% | Cumulative: 73.60%
PC5: Explains 6.56% | Cumulative: 80.16%
PC6: Explains 4.94% | Cumulative: 85.10%
PC7: Explains 4.24% | Cumulative: 89.34%
PC8: Explains 2.68% | Cumulative: 92.02%
PC9: Explains 2.22% | Cumulative: 94.24%
PC10: Explains 1.93% | Cumulative: 96.17%
PC11: Explains 1.74% | Cumulative: 97.91%
PC12: Explains 1.30% | Cumulative: 99.20%
PC13: Explains 0.80% | Cumulative: 100.00%

Interpretation of the Output¶

When you run this code, look at the output table and chart:

The Elbow: Look at the blue bars (Scree). You will likely see a steep drop after PC3. The bars for PC4 through PC13 become quite flat. This suggests the intrinsic dimensionality is likely around 3.

The Threshold: Look at the red line (Cumulative).

PC1 + PC2: ~55% (Good for 2D visualization, but we lost nearly half the data).

PC1 through PC3: ~66%.

PC1 through PC8: This is where we cross the 90% threshold (about 92% cumulative variance).

The Verdict:

If your goal is Visualization (making a chart for a slide deck), stop at 2 or 3 components. We accept the loss of information for the sake of plotting.

If your goal is Machine Learning (feeding this data into a classifier), you should likely keep the first 7 or 8 components to ensure the model has enough signal (90% variance) to learn effectively.
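
As a hedged shortcut worth knowing: scikit-learn's PCA can choose that number for you. Passing a fraction between 0 and 1 as n_components keeps the smallest number of components whose cumulative explained variance reaches that fraction. A minimal sketch on the same Wine data:

from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardize the Wine features as before
X_scaled = StandardScaler().fit_transform(load_wine().data)

# A float in (0, 1) means "keep enough components to reach this variance fraction"
pca_90 = PCA(n_components=0.90)
X_reduced = pca_90.fit_transform(X_scaled)

print(f"Components kept: {pca_90.n_components_}")   # per the table above, this should be 8
print(f"Variance preserved: {pca_90.explained_variance_ratio_.sum():.2%}")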

Learning Points¶

  • Data transformation is the crucial process of converting raw, messy data from various sources into a clean, standardized, and usable format, making it ready for analysis.
  • PCA transforms high-dimensional data into a smaller set of uncorrelated variables called principal components.
  • Data standardization converts data from various sources into a consistent, uniform format that follows predefined rules for structure, definition, and values, making it comparable, accessible, and reliable for analysis.
  • The Elbow: the point where the scree plot flattens suggests the data's intrinsic dimensionality.
  • The Threshold: keep components until the cumulative explained variance reaches a preset target (e.g., 90%).
  • If your goal is Visualization (making a chart for a slide deck), stop at 2 or 3 components. We accept the loss of information for the sake of plotting.
  • If your goal is Machine Learning (feeding this data into a classifier), you should likely keep the first 7 or 8 components to ensure the model has enough signal (90% variance) to learn effectively.