Kelzang Tobgyel - Fab Futures - Data Science

Assignment 7: Transformation

Learning about PCA and how it works: I have learnt the following information from Gemini

Principal Component Analysis (PCA) is an unsupervised machine learning technique primarily used for dimensionality reduction. Its main goal is to transform a dataset with a large number of correlated variables (features) into a new, smaller set of uncorrelated variables called Principal Components (PCs), while retaining most of the variability (or information) present in the original data.

1. What is PCA?

Imagine you have a 3-dimensional cloud of data points (like the salary data with features like salary, age, and remote ratio). If this cloud is relatively flat—meaning the points mostly lie on a plane within that 3D space—you don't truly need three dimensions to describe the data's variation.

PCA finds the directions (or axes) of maximum variance in the data.

Core Concepts:

Principal Components (PCs): These are the new axes.

PC1 is the axis along which the data varies the most.

PC2 is the second most important axis, orthogonal (perpendicular) to PC1, capturing the next highest amount of remaining variance.

This continues until all dimensions are accounted for.

Dimensionality Reduction: By keeping only the first $k$ Principal Components (where $k$ is much smaller than the original number of features), we effectively project the high-dimensional data onto a lower-dimensional subspace, simplifying the data without losing much information.
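
To make this concrete, here is a minimal sketch in Python (my own illustration on synthetic 3D data, not the salary dataset): PCA ranks the new axes by variance, and keeping only $k=2$ components projects the nearly flat cloud onto its plane.

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
# Synthetic 3D points that mostly lie on a flat plane inside 3D space
base = rng.normal(size=(200, 2))
X = np.column_stack([base[:, 0], base[:, 1], 0.05 * rng.normal(size=200)])

pca = PCA(n_components=3).fit(X)
print(pca.explained_variance_ratio_)   # first two PCs carry almost all the variance

# Keeping only k=2 components projects the cloud onto that plane
X_2d = PCA(n_components=2).fit_transform(X)
print(X_2d.shape)                      # (200, 2)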

2. Advantages for Understanding Data

Using PCA provides several powerful advantages when exploring and preparing data, especially high-dimensional datasets like the one with many job titles and locations.

A. Data Visualization

The Problem: Humans cannot visualize data in more than three dimensions.

The Solution: PCA allows you to reduce a dataset with hundreds of features down to just two or three principal components (PC1 and PC2).

Benefit: You can then plot the data in a simple 2D or 3D scatter plot. This is crucial for exploratory data analysis (EDA), as it often reveals hidden clusters, outliers, or separation between different classes (e.g., seeing how "Executive" level salaries separate from "Entry" level salaries in your data).
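
As a small illustration (using sklearn's built-in digits dataset as a stand-in, since my salary data still needs the preprocessing described later), a dataset with 64 features can be squeezed into two components and shown in a single scatter plot:

import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

X, y = load_digits(return_X_y=True)                 # 64 pixel features per image
X_2d = PCA(n_components=2).fit_transform(StandardScaler().fit_transform(X))

plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y, cmap='tab10', s=10, alpha=0.7)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Digits dataset projected onto the first two principal components')
plt.colorbar(label='Digit class')
plt.show()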

B. Feature Interpretation and Loadings

The Problem: In a complex model, it's hard to tell which combination of features is driving a particular outcome.

The Solution: Each Principal Component is a linear combination (a weighted sum) of the original features. The weight assigned to each original feature is called its loading.

Benefit: By examining the loadings of the top components (e.g., PC1), you can determine which original variables (like salary_in_usd, experience_level_EX, or job_title_Data Scientist) are most responsible for the largest pattern of variance in your dataset.
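
A minimal sketch of reading loadings, using a hypothetical three-column table of random numbers rather than my real preprocessed features: each row of components_ holds one PC's weights on the original columns.

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Hypothetical numeric columns standing in for the real preprocessed features
rng = np.random.default_rng(0)
toy = pd.DataFrame({
    'salary_in_usd': rng.normal(120_000, 30_000, 300),
    'Age':           rng.normal(35, 8, 300),
    'remote_ratio':  rng.choice([0, 50, 100], 300),
})

pca = PCA(n_components=2).fit(StandardScaler().fit_transform(toy))

# Each row of components_ holds one PC's loadings on the original columns
pc1_loadings = pd.Series(pca.components_[0], index=toy.columns)
print(pc1_loadings.sort_values(key=abs, ascending=False))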

C. Removing Noise and Redundancy

The Problem: Real-world datasets often have noisy, correlated, or redundant features that can confuse a modeling algorithm.

The Solution: PCA isolates the most informative axes (the first few PCs) and discards the components that contain very little variance. These low-variance components are often associated with random noise or minor fluctuations.

Benefit: By using only the top $k$ components, you are essentially performing noise reduction, which can significantly improve the performance and robustness of subsequent machine learning models.
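
A quick way to see the noise-reduction idea, sketched on sklearn's built-in digits dataset as a stand-in: project noisy data onto the top components and reconstruct it with inverse_transform, which discards the low-variance directions where most of the noise lives.

import numpy as np
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA

X, _ = load_digits(return_X_y=True)
rng = np.random.default_rng(0)
X_noisy = X + rng.normal(scale=2.0, size=X.shape)   # add random pixel noise

# Keep only the top 15 of 64 directions, then reconstruct
pca = PCA(n_components=15).fit(X_noisy)
X_denoised = pca.inverse_transform(pca.transform(X_noisy))

print(f"Kept {pca.n_components_} components covering "
      f"{pca.explained_variance_ratio_.sum():.0%} of the variance")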

D. Solving Multicollinearity

The Problem: Multicollinearity occurs when multiple features in your dataset are highly correlated (e.g., if "Mid-Term Score" and "Final Exam Score" are both excellent predictors of "Overall Grade"). This can destabilize models like linear regression.

The Solution: The Principal Components are, by definition, uncorrelated with one another.

Benefit: Using the PCs as input features for a model eliminates multicollinearity issues entirely, leading to more stable and interpretable results.
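
This can be verified directly: after the transform, the correlation matrix of the PC scores is (numerically) the identity. A small sketch with deliberately correlated random features:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
a = rng.normal(size=500)
# Two nearly identical (highly correlated) features plus one independent one
X = np.column_stack([a, a + 0.1 * rng.normal(size=500), rng.normal(size=500)])

print(np.corrcoef(X, rowvar=False).round(2))        # strong off-diagonal correlation
scores = PCA(n_components=3).fit_transform(X)
print(np.corrcoef(scores, rowvar=False).round(2))   # essentially the identity matrix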

E. Improving Model Efficiency

The Problem: Training machine learning models (especially complex ones like neural networks) on high-dimensional data is computationally expensive and slow.

The Solution: Dimensionality reduction via PCA drastically reduces the number of features.

Benefit: This speeds up the training process, requires less memory, and helps prevent the "curse of dimensionality" (where models struggle to find patterns in sparsely filled high-dimensional space).
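
A rough sketch of the speed-up, using the digits dataset and logistic regression as stand-ins for my data and an eventual model: the same classifier is fitted on all 64 features and on 10 principal components, and the fit times are compared.

import time
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

X, y = load_digits(return_X_y=True)
X_scaled = StandardScaler().fit_transform(X)
X_reduced = PCA(n_components=10).fit_transform(X_scaled)

for name, features in [('all 64 pixels', X_scaled), ('10 principal components', X_reduced)]:
    start = time.perf_counter()
    LogisticRegression(max_iter=2000).fit(features, y)
    print(f"{name}: fitted in {time.perf_counter() - start:.2f} s")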

Understanding the Professor's code with help from ChatGPT

I have understood that the Professor's code on PCA was divided into six major segments (a rough sketch of these steps follows the list):

  1. Loading the data from MNIST
  2. Initial Visualization (Pixel Space): the first plot shows the digit colors (0-9) completely mixed, which means arbitrary pixels do not contain enough information to separate the different digits.
  3. Data Standardization (Preprocessing)
  4. Applying Principal Component Analysis (PCA) from sklearn library
  5. Analyzing Explained Variance: the plot typically shows that the first few components (PC1, PC2) capture a large share of the total variance and the line quickly flattens, confirming that 50 components are sufficient to summarize most of the data's complexity.
  6. Final Visualization (PCA Space): unlike the initial plot using random pixels, this visualization shows that the different colored digits (0-9) are clearly grouped and separated. PCA successfully reduces the 784 pixel dimensions down to just 2 meaningful dimensions (PC1 and PC2) that preserve the most useful discriminatory information.
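
Here is my own rough sketch of those six segments (an approximation, not the Professor's actual code; I am assuming MNIST can be fetched with sklearn's fetch_openml('mnist_784'), which may differ from how his notebook loads the data):

import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# 1. Load MNIST (784 pixel features per image); subsample to keep it quick
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
X, y = X[:5000], y[:5000].astype(int)

# 2. Initial visualization: two arbitrary pixels show no class separation
plt.scatter(X[:, 350], X[:, 400], c=y, cmap='tab10', s=5)
plt.title('Two arbitrary pixels: digit classes are completely mixed')
plt.show()

# 3. Standardize, then 4. fit PCA with 50 components
X_scaled = StandardScaler().fit_transform(X)
pca = PCA(n_components=50).fit(X_scaled)
X_pca = pca.transform(X_scaled)

# 5. Explained variance: the cumulative curve rises fast, then flattens
plt.plot(range(1, 51), pca.explained_variance_ratio_.cumsum(), marker='.')
plt.xlabel('Number of components')
plt.ylabel('Cumulative variance explained')
plt.show()

# 6. Final visualization: PC1 vs PC2 now shows visible digit groupings
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='tab10', s=5)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()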

My data transformation analysis: Explaining the most variance in salary

From doing this exercise, I have learnt that the PCA algorithm works best with quantitative data. Since my data was a mixed collection, I had to go through the following process before applying PCA. This was the recommendation from Gemini.

  1. Cleaning: Handle missing values and convert the categorical features (like job_title and experience_level) into numerical form using One-Hot Encoding.

  2. Standardizing: Scale the numerical data so that no single feature (like salary_in_usd) dominates the analysis.

  3. Applying PCA: Run PCA, analyze the explained variance, and visualize the data in the 2D Principal Component space.

In [1]:
#Code generated from Gemini
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

# 1. Load the dataset
try:
    df = pd.read_csv('datasets/Dataset salary 2024.csv')
except FileNotFoundError:
    print("Error: 'Dataset salary 2024.csv' not found. Please ensure the file is correctly uploaded.")
    exit()

# Data cleaning and preprocessing
# Filter out the row with missing Age (which is represented by a grave accent `)
df = df[df['Age'] != '`']

# Convert Age to numeric, forcing errors to NaN if any new non-numeric values appear
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

# Drop rows with any remaining missing values for simplicity in this analysis
df.dropna(inplace=True)

# Define the feature groups for the PCA (PCA is unsupervised, so no target is used)
# 'salary_in_usd', 'Age' and 'remote_ratio' are the numerical features to analyze
numerical_features = ['salary_in_usd', 'Age', 'remote_ratio']
categorical_features = ['experience_level', 'employment_type', 'job_title',
                        'employee_residence', 'company_location', 'company_size']

# Select the feature columns for the analysis (PCA is unsupervised, so no target is needed)
X = df[numerical_features + categorical_features].copy()

# 2. Preprocessing Pipeline (Standardization and One-Hot Encoding)
# We use a ColumnTransformer to apply different preprocessing steps to different columns

# Create transformers for numerical and categorical data
numerical_transformer = StandardScaler() 
categorical_transformer = OneHotEncoder(handle_unknown='ignore') # Converting categories to numbers

# Create the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop' # Dropping any columns not specified above
)

# 3. Creating the PCA Pipeline
n_components_to_fit = 10
pca_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=n_components_to_fit))
])

# Fit the pipeline to the data (this transforms and fits the PCA model)
X_pca_transformed = pca_pipeline.fit_transform(X)

# Extract the fitted PCA model and the variance data
pca_model = pca_pipeline['pca']
explained_variance_ratio = pca_model.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance_ratio)

# 4. Visualization of Explained Variance (Scree Plot)
plt.figure(figsize=(10, 5))
plt.plot(range(1, n_components_to_fit + 1), explained_variance_ratio, marker='o', linestyle='--', label='Individual Variance')
plt.plot(range(1, n_components_to_fit + 1), cumulative_variance, marker='o', linestyle='-', color='red', label='Cumulative Variance')
plt.title('Explained Variance by Principal Component')
plt.xlabel('Principal Component Number')
plt.ylabel('Proportion of Variance Explained')
plt.xticks(range(1, n_components_to_fit + 1))
plt.grid(True)
plt.legend()
plt.show()
print(f"\nCumulative Variance Explained by First {n_components_to_fit} Components: {cumulative_variance[-1]*100:.2f}%")

# 5. Visualization of Data in the First Two Principal Components (2D Plot)
# Use 'experience_level' to color-code the plot to see if PC1/PC2 separate experience groups
experience_colors = df['experience_level'].astype('category').cat.codes

plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    X_pca_transformed[:, 0], # First Principal Component (PC1)
    X_pca_transformed[:, 1], # Second Principal Component (PC2)
    c=experience_colors,
    cmap='viridis',
    s=15,
    alpha=0.6
)
plt.title('Salary Data Projected onto PC1 and PC2')
plt.xlabel(f'Principal Component 1 ({explained_variance_ratio[0]*100:.2f}% of Variance)')
plt.ylabel(f'Principal Component 2 ({explained_variance_ratio[1]*100:.2f}% of Variance)')

# Add a legend for the color coding
legend1 = plt.legend(*scatter.legend_elements(),
                    loc="lower left", title="Experience Level")
plt.gca().add_artist(legend1)

plt.grid(True, linestyle='--')
plt.show()

# 6. Interpretation of the First Component (Optional but informative)
# The PCA component weights indicate which original features contribute most to that component.
# This part requires accessing the feature names after One-Hot Encoding.

# Get the feature names after preprocessing
feature_names = list(preprocessor.named_transformers_['num'].get_feature_names_out(numerical_features))
feature_names.extend(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features))

# Get the weights (loadings) of the first component
loadings = pca_model.components_[0]

# Create a series of features and their loadings for PC1
pc1_loadings = pd.Series(loadings, index=feature_names).sort_values(ascending=False)

print("\n--- Top 5 Features Contributing to Principal Component 1 ---")
print(pc1_loadings.head(5))
print("\n--- Bottom 5 Features Contributing to Principal Component 1 ---")
print(pc1_loadings.tail(5))
[Figure: scree plot of individual and cumulative explained variance for the first 10 principal components]
Cumulative Variance Explained by First 10 Components: 87.32%
[Figure: salary data projected onto PC1 and PC2, colored by experience level]
--- Top 5 Features Contributing to Principal Component 1 ---
salary_in_usd                          0.844742
experience_level_SE                    0.148562
employee_residence_US                  0.112006
company_location_US                    0.107617
job_title_Machine Learning Engineer    0.058138
dtype: float64

--- Bottom 5 Features Contributing to Principal Component 1 ---
experience_level_EN      -0.064008
job_title_Data Analyst   -0.087230
experience_level_MI      -0.097495
Age                      -0.098933
remote_ratio             -0.447200
dtype: float64

Analysis

The PCA model effectively distills the core variance of the dataset into two meaningful dimensions:

  • PC1 is the dimension of Career Advancement: it directly reflects the expected progression of salary and age associated with climbing the corporate ladder.
  • PC2 is the dimension of Work Arrangement: it cleanly isolates the impact of remote_ratio from compensation and age, providing a clear vertical separation between work models across all experience levels (the PC2 loadings can be inspected with the short snippet below).

The separation of experience levels along the PC1 axis is very strong, indicating that compensation and age are tightly linked and highly predictive of an individual's career standing within this dataset.
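
A minimal check of the PC2 reading, assuming pca_model and feature_names from the code above are still defined in the notebook:

# Assumes pca_model and feature_names from the fitted pipeline above are still in memory
pc2_loadings = pd.Series(pca_model.components_[1], index=feature_names)
print(pc2_loadings.sort_values(key=abs, ascending=False).head(5))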