Philippe Libioulle - Fab Futures - Data Science


Week 4: machine learning - "AI impact on jobs by 2030" dataset¶

Context¶

  • Source: Kaggle
  • Description: this dataset simulates the future of work in the age of artificial intelligence. It models how various professions, skills, and education levels might be impacted by AI-driven automation by the year 2030.

Load dataset¶

In [1]:
import pandas as pd 
import seaborn as sb
import matplotlib.pyplot as plt

df = pd.read_csv("datasets/AI_Impact_on_Jobs_2030.csv")

# 🧾 Display dataset information
print("Dataset shape:", df.shape)
# print(df.info())
Dataset shape: (3000, 18)

Explore content¶

In [2]:
df.head()
Out[2]:
Job_Title Average_Salary Years_Experience Education_Level AI_Exposure_Index Tech_Growth_Factor Automation_Probability_2030 Risk_Category Skill_1 Skill_2 Skill_3 Skill_4 Skill_5 Skill_6 Skill_7 Skill_8 Skill_9 Skill_10
0 Security Guard 45795 28 Master's 0.18 1.28 0.85 High 0.45 0.10 0.46 0.33 0.14 0.65 0.06 0.72 0.94 0.00
1 Research Scientist 133355 20 PhD 0.62 1.11 0.05 Low 0.02 0.52 0.40 0.05 0.97 0.23 0.09 0.62 0.38 0.98
2 Construction Worker 146216 2 High School 0.86 1.18 0.81 High 0.01 0.94 0.56 0.39 0.02 0.23 0.24 0.68 0.61 0.83
3 Software Engineer 136530 13 PhD 0.39 0.68 0.60 Medium 0.43 0.21 0.57 0.03 0.84 0.45 0.40 0.93 0.73 0.33
4 Financial Analyst 70397 22 High School 0.52 1.46 0.64 Medium 0.75 0.54 0.59 0.97 0.61 0.28 0.30 0.17 0.02 0.42
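
Before modelling, it is worth glancing at how the target is distributed; here is a minimal sketch (my addition, reusing the df loaded above and the column names shown by df.head()):

In [ ]:
# Quick look at the target before modelling (not part of the original notebook)
print(df["Automation_Probability_2030"].describe())
print(df["Risk_Category"].value_counts())
# Share of rows above the 0.5 threshold used later for binary labels
print((df["Automation_Probability_2030"] >= 0.5).mean())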

Experiment 1 - Let me try to adapt, by myself, the code Claude AI generated for the bank loan dataset...¶

Here is the code¶

In [3]:
import jax
import jax.numpy as jnp
from jax import grad, jit
import csv

# Load and preprocess data
# [Philippe] Still the same question here: why do we get rid of the non-numerical features ?? 

def load_data(filename, max_rows=5000):
    with open(filename, 'r') as f:
        reader = csv.DictReader(f)
        rows = list(reader)[:max_rows]  # Limit data for memory
   
    # Select numerical features
    feature_names = ['Average_Salary', 'Years_Experience', 
                     'AI_Exposure_Index', 'Tech_Growth_Factor', 'Skill_1',
                     'Skill_2', 'Skill_3', 'Skill_4',
                     'Skill_5', 'Skill_6', 'Skill_7',
                     'Skill_8', 'Skill_9', 'Skill_10']
   
    X = []
    y = []
    for row in rows:
        features = [float(row[name]) for name in feature_names]
        X.append(features)
        y.append(float(row['Automation_Probability_2030']))
   
    return jnp.array(X), jnp.array(y)

# Normalize features
def normalize(X):
    mean = jnp.mean(X, axis=0)
    std = jnp.std(X, axis=0) + 1e-8
    return (X - mean) / std

# Initialize network parameters
# [Philippe] In this code (when called), we will have 4 layers: an input layer (14 features => 14 neurons), a hidden layer (15 neurons - why one more than the number of inputs?), another hidden layer (8 neurons) and an output layer with 1 neuron

def init_network(layer_sizes, key):
    params = []
    for i in range(len(layer_sizes) - 1):
        key, subkey = jax.random.split(key)
        w = jax.random.normal(subkey, (layer_sizes[i], layer_sizes[i+1])) * 0.1
        b = jnp.zeros(layer_sizes[i+1])
        params.append((w, b))
    return params

# Forward pass
# [Philippe] 

def forward(params, x):
    for i, (w, b) in enumerate(params[:-1]):
        x = jnp.tanh(jnp.dot(x, w) + b)
    w, b = params[-1]
    return jnp.dot(x, w) + b

# Sigmoid activation
def sigmoid(x):
    return 1 / (1 + jnp.exp(-x))

# Binary cross-entropy loss
def loss_fn(params, x, y):
    logits = forward(params, x)
    probs = sigmoid(logits.squeeze())
    return -jnp.mean(y * jnp.log(probs + 1e-8) + (1 - y) * jnp.log(1 - probs + 1e-8))

# Prediction function
def predict(params, x):
    logits = forward(params, x)
    return (sigmoid(logits.squeeze()) > 0.5).astype(jnp.float32)

# Training step
@jit
def train_step(params, x, y, lr):
    loss, grads = jax.value_and_grad(loss_fn)(params, x, y)
    params = [(w - lr * dw, b - lr * db) for (w, b), (dw, db) in zip(params, grads)]
    return params, loss

# Main training loop
def train(X, y, layer_sizes=[14, 15, 8, 1], epochs=500, lr=0.01, batch_size=64):
    key = jax.random.PRNGKey(42)
   
    # Normalize data
    X_norm = normalize(X)
   
    # Split data (80/20)
    n_train = int(0.8 * len(X))
    X_train, X_test = X_norm[:n_train], X_norm[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]
   
    # Initialize network
    params = init_network(layer_sizes, key)
   
    # Training loop
    n_batches = len(X_train) // batch_size
   
    for epoch in range(epochs):
        # Shuffle data
        key, subkey = jax.random.split(key)
        perm = jax.random.permutation(subkey, len(X_train))
        X_train_shuffled = X_train[perm]
        y_train_shuffled = y_train[perm]
       
        for i in range(n_batches):
            start = i * batch_size
            end = start + batch_size
            X_batch = X_train_shuffled[start:end]
            y_batch = y_train_shuffled[start:end]
           
            params, batch_loss = train_step(params, X_batch, y_batch, lr)
       
        if (epoch + 1) % 50 == 0:
            train_loss = loss_fn(params, X_train, y_train)
            test_loss = loss_fn(params, X_test, y_test)
           
            train_preds = predict(params, X_train)
            test_preds = predict(params, X_test)
            train_acc = jnp.mean(train_preds == y_train)
            test_acc = jnp.mean(test_preds == y_test)
           
            print(f"Epoch {epoch+1}/{epochs}")
            print(f"  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
            print(f"  Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")
   
    return params

if __name__ == "__main__":
    # Load data
    print("Loading data...")
    X, y = load_data("datasets/AI_Impact_on_Jobs_2030.csv")
    print(f"Dataset: {X.shape[0]} samples, {X.shape[1]} features")
   
    # Train model
    print("\nTraining neural network...")
    params = train(X, y)
   
    print("\nTraining complete!")
Loading data...
Dataset: 3000 samples, 14 features

Training neural network...
Epoch 50/500
  Train Loss: 0.6930, Train Acc: 0.0000
  Test Loss: 0.6937, Test Acc: 0.0000
Epoch 100/500
  Train Loss: 0.6927, Train Acc: 0.0000
  Test Loss: 0.6940, Test Acc: 0.0000
Epoch 150/500
  Train Loss: 0.6925, Train Acc: 0.0000
  Test Loss: 0.6944, Test Acc: 0.0000
Epoch 200/500
  Train Loss: 0.6924, Train Acc: 0.0000
  Test Loss: 0.6947, Test Acc: 0.0000
Epoch 250/500
  Train Loss: 0.6923, Train Acc: 0.0000
  Test Loss: 0.6950, Test Acc: 0.0000
Epoch 300/500
  Train Loss: 0.6922, Train Acc: 0.0000
  Test Loss: 0.6952, Test Acc: 0.0000
Epoch 350/500
  Train Loss: 0.6922, Train Acc: 0.0000
  Test Loss: 0.6954, Test Acc: 0.0000
Epoch 400/500
  Train Loss: 0.6921, Train Acc: 0.0000
  Test Loss: 0.6955, Test Acc: 0.0000
Epoch 450/500
  Train Loss: 0.6921, Train Acc: 0.0000
  Test Loss: 0.6956, Test Acc: 0.0000
Epoch 500/500
  Train Loss: 0.6921, Train Acc: 0.0000
  Test Loss: 0.6956, Test Acc: 0.0000

Training complete!

Conclusion: it runs, but the results are disappointing: the loss barely moves and the accuracy is exactly 0.0000, since the 0/1 predictions are compared with continuous probability targets that are almost never exactly 0 or 1...¶
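
A tiny sketch of the symptom (my addition, not part of the original run):

In [ ]:
import jax.numpy as jnp

preds = jnp.array([1.0, 0.0, 1.0])       # thresholded network outputs (0 or 1)
targets = jnp.array([0.85, 0.05, 0.81])  # continuous probabilities from the CSV
print(jnp.mean(preds == targets))        # 0.0 - the values are never exactly equal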

Me: "So, AI, what's wrong with my code ?"
AI: "Missing data binarization: Your loss function uses binary cross-entropy, but Automation_Probability_2030 in your CSV is likely a continuous value (0.0 to 1.0), not binary (0 or 1). You need to convert it to binary labels"
Me: "What??? The output in the dataset is a probability. You just ignore it by replacing it by a binary value ! Anyway, let's have a look on your suggestion:

Experiment 2 - switching from probability to boolean¶

Here is the code¶

In [4]:
import jax
import jax.numpy as jnp
from jax import grad, jit
import csv

# Load and preprocess data
def load_data(filename, max_rows=5000, threshold=0.5):
    with open(filename, 'r') as f:
        reader = csv.DictReader(f)
        rows = list(reader)[:max_rows]  # Limit data for memory
   
    # Select numerical features
    feature_names = ['Average_Salary', 'Years_Experience', 
                     'AI_Exposure_Index', 'Tech_Growth_Factor', 'Skill_1',
                     'Skill_2', 'Skill_3', 'Skill_4',
                     'Skill_5', 'Skill_6', 'Skill_7',
                     'Skill_8', 'Skill_9', 'Skill_10']
   
    X = []
    y = []
    for row in rows:
        features = [float(row[name]) for name in feature_names]
        X.append(features)
        # Convert probability to binary label   ====================== this is the new code   
        prob = float(row['Automation_Probability_2030'])
        y.append(1.0 if prob >= threshold else 0.0)
   
    return jnp.array(X), jnp.array(y)

# Normalize features
def normalize(X):
    mean = jnp.mean(X, axis=0)
    std = jnp.std(X, axis=0) + 1e-8
    return (X - mean) / std

# Initialize network parameters
def init_network(layer_sizes, key):
    params = []
    for i in range(len(layer_sizes) - 1):
        key, subkey = jax.random.split(key)
        w = jax.random.normal(subkey, (layer_sizes[i], layer_sizes[i+1])) * 0.1
        b = jnp.zeros(layer_sizes[i+1])
        params.append((w, b))
    return params

# Forward pass
def forward(params, x):
    for i, (w, b) in enumerate(params[:-1]):
        x = jnp.tanh(jnp.dot(x, w) + b)
    w, b = params[-1]
    return jnp.dot(x, w) + b

# Sigmoid activation
def sigmoid(x):
    return 1 / (1 + jnp.exp(-x))

# Binary cross-entropy loss 
def loss_fn(params, x, y):
    logits = forward(params, x)
    probs = sigmoid(logits.squeeze())
    return -jnp.mean(y * jnp.log(probs + 1e-8) + (1 - y) * jnp.log(1 - probs + 1e-8))

# Prediction function
def predict(params, x):
    logits = forward(params, x)
    return (sigmoid(logits.squeeze()) > 0.5).astype(jnp.float32)

# Training step
@jit
def train_step(params, x, y, lr):
    loss, grads = jax.value_and_grad(loss_fn)(params, x, y)
    params = [(w - lr * dw, b - lr * db) for (w, b), (dw, db) in zip(params, grads)]
    return params, loss

# Main training loop
def train(X, y, layer_sizes=[14, 8, 1], epochs=500, lr=0.01, batch_size=64):
    key = jax.random.PRNGKey(42)
   
    # Normalize data
    X_norm = normalize(X)
   
    # Split data (80/20)
    n_train = int(0.8 * len(X))
    X_train, X_test = X_norm[:n_train], X_norm[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]
   
    # Initialize network
    params = init_network(layer_sizes, key)
   
    # Training loop
    n_batches = len(X_train) // batch_size
   
    for epoch in range(epochs):
        # Shuffle data
        key, subkey = jax.random.split(key)
        perm = jax.random.permutation(subkey, len(X_train))
        X_train_shuffled = X_train[perm]
        y_train_shuffled = y_train[perm]
       
        for i in range(n_batches):
            start = i * batch_size
            end = start + batch_size
            X_batch = X_train_shuffled[start:end]
            y_batch = y_train_shuffled[start:end]
           
            params, batch_loss = train_step(params, X_batch, y_batch, lr)
       
        if (epoch + 1) % 50 == 0:
            train_loss = loss_fn(params, X_train, y_train)
            test_loss = loss_fn(params, X_test, y_test)
           
            train_preds = predict(params, X_train)
            test_preds = predict(params, X_test)
            train_acc = jnp.mean(train_preds == y_train)
            test_acc = jnp.mean(test_preds == y_test)
           
            print(f"Epoch {epoch+1}/{epochs}")
            print(f"  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
            print(f"  Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")
   
    return params

if __name__ == "__main__":
    # Load data
    print("Loading data...")
    X, y = load_data("datasets/AI_Impact_on_Jobs_2030.csv")
    print(f"Dataset: {X.shape[0]} samples, {X.shape[1]} features")
   
    # Train model
    print("\nTraining neural network...")
    params = train(X, y)
   
    print("\nTraining complete!")
Loading data...
Dataset: 3000 samples, 14 features

Training neural network...
Epoch 50/500
  Train Loss: 0.6914, Train Acc: 0.5296
  Test Loss: 0.6963, Test Acc: 0.4833
Epoch 100/500
  Train Loss: 0.6902, Train Acc: 0.5258
  Test Loss: 0.6985, Test Acc: 0.4850
Epoch 150/500
  Train Loss: 0.6896, Train Acc: 0.5300
  Test Loss: 0.7002, Test Acc: 0.4817
Epoch 200/500
  Train Loss: 0.6893, Train Acc: 0.5358
  Test Loss: 0.7015, Test Acc: 0.4750
Epoch 250/500
  Train Loss: 0.6892, Train Acc: 0.5350
  Test Loss: 0.7022, Test Acc: 0.4733
Epoch 300/500
  Train Loss: 0.6891, Train Acc: 0.5383
  Test Loss: 0.7027, Test Acc: 0.4667
Epoch 350/500
  Train Loss: 0.6890, Train Acc: 0.5375
  Test Loss: 0.7028, Test Acc: 0.4683
Epoch 400/500
  Train Loss: 0.6889, Train Acc: 0.5387
  Test Loss: 0.7031, Test Acc: 0.4683
Epoch 450/500
  Train Loss: 0.6889, Train Acc: 0.5396
  Test Loss: 0.7034, Test Acc: 0.4683
Epoch 500/500
  Train Loss: 0.6888, Train Acc: 0.5379
  Test Loss: 0.7035, Test Acc: 0.4683

Training complete!

Conclusion: better, but... a monkey would do about as well (53% accuracy). Let's add two important missing features: job title and education level.¶
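
To check that intuition, here is a short sketch (my addition) of the accuracy a constant majority-class prediction would get, assuming y is the binary label array returned by load_data() in the cell above:

In [ ]:
import numpy as np

# Accuracy of always predicting the most frequent class (naive baseline)
y_np = np.asarray(y)
majority_share = max(y_np.mean(), 1.0 - y_np.mean())
print(f"Majority-class baseline accuracy: {majority_share:.3f}")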

Me: "AI, add the job title and the education level as features to the code.
AI: "Here it is.."

Experiment 3 - dealing with categories¶

What are the available strategies?¶

  1. Label Encoding: assigns a unique integer to each category.

    Use case: Suitable for ordinal categorical features where there's a meaningful order (e.g., "low," "medium," "high").
    Caution: Can introduce artificial order for nominal data, which might negatively impact models expecting independent features.

  2. One-Hot Encoding: creates new binary columns for each category, indicating presence (1) or absence (0) (see the pandas sketch after this list).

    Use case: Ideal for nominal categorical features where no inherent order exists (e.g., "red," "blue," "green").
    Caution: Can lead to high dimensionality ("curse of dimensionality") for features with many unique categories (high cardinality).

  3. Target Encoding (Mean Encoding): replaces categories with the mean of the target variable for that category.

    Use case: Effective for high-cardinality features, as it reduces dimensionality while capturing predictive information.
    Caution: Prone to data leakage if not implemented carefully (e.g., calculating means on the entire dataset before splitting into train/test). Use cross-validation or calculate means only on the training set.

  4. Binary Encoding: converts categories to binary code, then creates new columns for each bit in the binary representation.

    Use case: A compromise between Label and One-Hot Encoding, reducing dimensionality compared to One-Hot while still providing a numerical representation.

  5. Feature Hashing (Hashing Trick): maps categories to a fixed-size vector using a hash function.

    Use case: Efficient for very high-cardinality features and when memory is a concern.
    Caution: Can lead to collisions (different categories mapping to the same hash value), potentially impacting model accuracy.
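
The next cell hand-rolls strategy 2. As a point of comparison, here is a hedged sketch of the same one-hot encoding done with pandas (my addition, assuming the same CSV as in the first cell; get_dummies is a standard pandas function):

In [ ]:
import pandas as pd

df = pd.read_csv("datasets/AI_Impact_on_Jobs_2030.csv")
# One binary column per Job_Title value and per Education_Level value
encoded = pd.get_dummies(df, columns=["Job_Title", "Education_Level"], dtype=float)
print(encoded.shape)  # 3000 rows; original columns minus the 2 categorical ones,
                      # plus 20 job-title and 4 education-level dummy columns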

Here is the code (One-Hot Encoding strategy)¶

In [5]:
import jax
import jax.numpy as jnp
from jax import grad, jit
import csv

# Define categorical mappings
JOB_TITLES = ['AI Engineer', 'Chef', 'Construction Worker', 'Customer Support', 
              'Data Scientist', 'Doctor', 'Financial Analyst', 'Graphic Designer', 
              'HR Specialist', 'Lawyer', 'Marketing Manager', 'Mechanic', 
              'Nurse', 'Research Scientist', 'Retail Worker', 'Security Guard', 
              'Software Engineer', 'Teacher', 'Truck Driver', 'UX Researcher']

EDUCATION_LEVELS = ["Bachelor's", 'High School', "Master's", 'PhD']

# Load and preprocess data
def load_data(filename, max_rows=5000, threshold=0.5):
    with open(filename, 'r') as f:
        reader = csv.DictReader(f)
        rows = list(reader)[:max_rows]  # Limit data for memory
   
    # Select numerical features
    numerical_features = ['Average_Salary', 'Years_Experience', 
                         'AI_Exposure_Index', 'Tech_Growth_Factor', 'Skill_1',
                         'Skill_2', 'Skill_3', 'Skill_4',
                         'Skill_5', 'Skill_6', 'Skill_7',
                         'Skill_8', 'Skill_9', 'Skill_10']
   
    X = []
    y = []
    
    for row in rows:
        # Extract numerical features
        features = [float(row[name]) for name in numerical_features]

        # [Philippe] This is the added code : since both fields are categories, we need to one-hot encode them
        #            We now have 38 total features (14 numerical + 20 job categories + 4 education categories)
        
        # One-hot encode Job_Title.   
        job_title = row['Job_Title']
        job_encoding = [1.0 if job_title == jt else 0.0 for jt in JOB_TITLES]
        
        # One-hot encode Education_Level
        education = row['Education_Level']
        edu_encoding = [1.0 if education == el else 0.0 for el in EDUCATION_LEVELS]
        
        # Combine all features
        all_features = features + job_encoding + edu_encoding
        X.append(all_features)
        
        # Convert probability to binary label
        prob = float(row['Automation_Probability_2030'])
        y.append(1.0 if prob >= threshold else 0.0)
   
    return jnp.array(X), jnp.array(y)

# Normalize features
def normalize(X):
    mean = jnp.mean(X, axis=0)
    std = jnp.std(X, axis=0) + 1e-8
    return (X - mean) / std

# Initialize network parameters
def init_network(layer_sizes, key):
    params = []
    for i in range(len(layer_sizes) - 1):
        key, subkey = jax.random.split(key)
        w = jax.random.normal(subkey, (layer_sizes[i], layer_sizes[i+1])) * 0.1
        b = jnp.zeros(layer_sizes[i+1])
        params.append((w, b))
    return params

# Forward pass
def forward(params, x):
    for i, (w, b) in enumerate(params[:-1]):
        x = jnp.tanh(jnp.dot(x, w) + b)
    w, b = params[-1]
    return jnp.dot(x, w) + b

# Sigmoid activation
def sigmoid(x):
    return 1 / (1 + jnp.exp(-x))

# Binary cross-entropy loss
def loss_fn(params, x, y):
    logits = forward(params, x)
    probs = sigmoid(logits.squeeze())
    return -jnp.mean(y * jnp.log(probs + 1e-8) + (1 - y) * jnp.log(1 - probs + 1e-8))

# Prediction function
def predict(params, x):
    logits = forward(params, x)
    return (sigmoid(logits.squeeze()) > 0.5).astype(jnp.float32)

# Training step
@jit
def train_step(params, x, y, lr):
    loss, grads = jax.value_and_grad(loss_fn)(params, x, y)
    params = [(w - lr * dw, b - lr * db) for (w, b), (dw, db) in zip(params, grads)]
    return params, loss

# Main training loop
def train(X, y, layer_sizes=[38, 24, 12, 1], epochs=500, lr=0.01, batch_size=64):
    key = jax.random.PRNGKey(42)
   
    # Normalize data
    X_norm = normalize(X)
   
    # Split data (80/20)
    n_train = int(0.8 * len(X))
    X_train, X_test = X_norm[:n_train], X_norm[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]
   
    # Initialize network
    params = init_network(layer_sizes, key)
   
    # Training loop
    n_batches = len(X_train) // batch_size
   
    for epoch in range(epochs):
        # Shuffle data
        key, subkey = jax.random.split(key)
        perm = jax.random.permutation(subkey, len(X_train))
        X_train_shuffled = X_train[perm]
        y_train_shuffled = y_train[perm]
       
        for i in range(n_batches):
            start = i * batch_size
            end = start + batch_size
            X_batch = X_train_shuffled[start:end]
            y_batch = y_train_shuffled[start:end]
           
            params, batch_loss = train_step(params, X_batch, y_batch, lr)
       
        if (epoch + 1) % 50 == 0:
            train_loss = loss_fn(params, X_train, y_train)
            test_loss = loss_fn(params, X_test, y_test)
           
            train_preds = predict(params, X_train)
            test_preds = predict(params, X_test)
            train_acc = jnp.mean(train_preds == y_train)
            test_acc = jnp.mean(test_preds == y_test)
           
            print(f"Epoch {epoch+1}/{epochs}")
            print(f"  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
            print(f"  Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")
   
    return params

if __name__ == "__main__":
    # Load data
    print("Loading data...")
    X, y = load_data("datasets/AI_Impact_on_Jobs_2030.csv")
    print(f"Dataset: {X.shape[0]} samples, {X.shape[1]} features")
    print(f"Feature breakdown:")
    print(f"  - 14 numerical features")
    print(f"  - 20 job title categories (one-hot encoded)")
    print(f"  - 4 education level categories (one-hot encoded)")
    print(f"  - Total: 38 features")
   
    # Train model
    print("\nTraining neural network...")
    params = train(X, y)
   
    print("\nTraining complete!")
Loading data...
Dataset: 3000 samples, 38 features
Feature breakdown:
  - 14 numerical features
  - 20 job title categories (one-hot encoded)
  - 4 education level categories (one-hot encoded)
  - Total: 38 features

Training neural network...
Epoch 50/500
  Train Loss: 0.3541, Train Acc: 0.7925
  Test Loss: 0.3874, Test Acc: 0.7483
Epoch 100/500
  Train Loss: 0.3342, Train Acc: 0.8117
  Test Loss: 0.3759, Test Acc: 0.7333
Epoch 150/500
  Train Loss: 0.3175, Train Acc: 0.8317
  Test Loss: 0.3754, Test Acc: 0.7533
Epoch 200/500
  Train Loss: 0.2995, Train Acc: 0.8492
  Test Loss: 0.3886, Test Acc: 0.7517
Epoch 250/500
  Train Loss: 0.2798, Train Acc: 0.8713
  Test Loss: 0.4066, Test Acc: 0.7483
Epoch 300/500
  Train Loss: 0.2553, Train Acc: 0.8846
  Test Loss: 0.4264, Test Acc: 0.7533
Epoch 350/500
  Train Loss: 0.2269, Train Acc: 0.9004
  Test Loss: 0.4545, Test Acc: 0.7567
Epoch 400/500
  Train Loss: 0.1980, Train Acc: 0.9188
  Test Loss: 0.4982, Test Acc: 0.7483
Epoch 450/500
  Train Loss: 0.1700, Train Acc: 0.9367
  Test Loss: 0.5620, Test Acc: 0.7550
Epoch 500/500
  Train Loss: 0.1428, Train Acc: 0.9508
  Test Loss: 0.6325, Test Acc: 0.7600

Training complete!

Nice! We have a test accuracy of 76%... but I see two issues:¶

  • Why is there such a large gap between the training accuracy and the test accuracy?
  • Why do we have so many additional features? Job title is a category: most of the time you cannot be a computer scientist AND a plumber AND a nurse at the same time, so why do we need a separate feature for each job title value?

AI answer to the first issue: the large gap between 95% training accuracy and 76% test accuracy is a classic sign of overfitting. Here are the main causes and solutions:

  1. Overfitting (Most Likely). Your model is memorizing the training data rather than learning generalizable patterns. Signs in your results:
    • Training loss keeps decreasing (0.35 → 0.14)
    • Test loss actually increases (0.39 → 0.63)
    • This divergence is a textbook overfitting pattern
  2. Model Complexity
    • You have 38 input features for only 2,400 training samples
    • The network has 3 hidden layers [38 → 24 → 12 → 1]
    • This gives the model enough capacity to memorize training examples
  3. Small Dataset
    • 2,400 training samples / 38 features = ~63 samples per feature
    • This ratio is relatively low for deep learning

Let me create an improved version with regularization techniques:

Me: "The problem now: I have no clue whether the AI is really improving things or just hallucinating..."

Experiment 4 - adding regularization¶

Here is the code¶

In [6]:
import jax
import jax.numpy as jnp
from jax import grad, jit, random
from functools import partial
import csv

# Define categorical mappings
JOB_TITLES = ['AI Engineer', 'Chef', 'Construction Worker', 'Customer Support', 
              'Data Scientist', 'Doctor', 'Financial Analyst', 'Graphic Designer', 
              'HR Specialist', 'Lawyer', 'Marketing Manager', 'Mechanic', 
              'Nurse', 'Research Scientist', 'Retail Worker', 'Security Guard', 
              'Software Engineer', 'Teacher', 'Truck Driver', 'UX Researcher']

EDUCATION_LEVELS = ["Bachelor's", 'High School', "Master's", 'PhD']

# Load and preprocess data
def load_data(filename, max_rows=5000, threshold=0.5):
    with open(filename, 'r') as f:
        reader = csv.DictReader(f)
        rows = list(reader)[:max_rows]
   
    numerical_features = ['Average_Salary', 'Years_Experience', 
                         'AI_Exposure_Index', 'Tech_Growth_Factor', 'Skill_1',
                         'Skill_2', 'Skill_3', 'Skill_4',
                         'Skill_5', 'Skill_6', 'Skill_7',
                         'Skill_8', 'Skill_9', 'Skill_10']
   
    X = []
    y = []
    
    for row in rows:
        features = [float(row[name]) for name in numerical_features]
        
        job_title = row['Job_Title']
        job_encoding = [1.0 if job_title == jt else 0.0 for jt in JOB_TITLES]
        
        education = row['Education_Level']
        edu_encoding = [1.0 if education == el else 0.0 for el in EDUCATION_LEVELS]
        
        all_features = features + job_encoding + edu_encoding
        X.append(all_features)
        
        prob = float(row['Automation_Probability_2030'])
        y.append(1.0 if prob >= threshold else 0.0)
   
    return jnp.array(X), jnp.array(y)

# Normalize features
def normalize(X):
    mean = jnp.mean(X, axis=0)
    std = jnp.std(X, axis=0) + 1e-8
    return (X - mean) / std, mean, std

# Initialize network parameters with Xavier initialization
def init_network(layer_sizes, key):
    params = []
    for i in range(len(layer_sizes) - 1):
        key, subkey = random.split(key)
        scale = jnp.sqrt(2.0 / (layer_sizes[i] + layer_sizes[i+1]))
        w = random.normal(subkey, (layer_sizes[i], layer_sizes[i+1])) * scale
        b = jnp.zeros(layer_sizes[i+1])
        params.append((w, b))
    return params

# Forward pass
def forward(params, x):
    for i, (w, b) in enumerate(params[:-1]):
        x = jnp.tanh(jnp.dot(x, w) + b)
    w, b = params[-1]
    return jnp.dot(x, w) + b

# Sigmoid activation
def sigmoid(x):
    return 1 / (1 + jnp.exp(-jnp.clip(x, -20, 20)))

# Binary cross-entropy loss with L2 regularization
def loss_fn(params, x, y, l2_reg):
    logits = forward(params, x)
    probs = sigmoid(logits.squeeze())
    
    # Cross-entropy loss
    ce_loss = -jnp.mean(y * jnp.log(probs + 1e-8) + (1 - y) * jnp.log(1 - probs + 1e-8))
    
    # L2 regularization (weight decay)
    l2_loss = 0.0
    for w, b in params:
        l2_loss += jnp.sum(w ** 2)
    
    return ce_loss + l2_reg * l2_loss

# Prediction function
def predict(params, x):
    logits = forward(params, x)
    return (sigmoid(logits.squeeze()) > 0.5).astype(jnp.float32)

# Training step
@jit
def train_step(params, x, y, lr, l2_reg):
    loss, grads = jax.value_and_grad(loss_fn)(params, x, y, l2_reg)
    params = [(w - lr * dw, b - lr * db) for (w, b), (dw, db) in zip(params, grads)]
    return params, loss

# Evaluation loss (no regularization for reporting)
def eval_loss(params, x, y):
    logits = forward(params, x)
    probs = sigmoid(logits.squeeze())
    return -jnp.mean(y * jnp.log(probs + 1e-8) + (1 - y) * jnp.log(1 - probs + 1e-8))

# Main training loop with early stopping
def train(X, y, layer_sizes=[38, 16, 1], epochs=500, lr=0.01, batch_size=64, 
          l2_reg=0.01, patience=150):
    key = random.PRNGKey(42)
   
    # Normalize data
    X_norm, mean, std = normalize(X)
   
    # Split data (80/20)
    n_train = int(0.8 * len(X))
    X_train, X_test = X_norm[:n_train], X_norm[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]
   
    # Initialize network
    params = init_network(layer_sizes, key)
   
    # Early stopping variables
    best_test_loss = float('inf')
    best_params = params
    patience_counter = 0
   
    # Training loop
    n_batches = len(X_train) // batch_size
   
    print(f"\nTraining Configuration:")
    print(f"  Network: {layer_sizes}")
    print(f"  L2 Regularization: {l2_reg}")
    print(f"  Learning Rate: {lr}")
    print(f"  Batch Size: {batch_size}")
    print(f"  Early Stopping Patience: {patience}\n")
   
    for epoch in range(epochs):
        # Shuffle data
        key, subkey = random.split(key)
        perm = random.permutation(subkey, len(X_train))
        X_train_shuffled = X_train[perm]
        y_train_shuffled = y_train[perm]
       
        for i in range(n_batches):
            start = i * batch_size
            end = start + batch_size
            X_batch = X_train_shuffled[start:end]
            y_batch = y_train_shuffled[start:end]
            
            params, batch_loss = train_step(params, X_batch, y_batch, lr, l2_reg)
       
        if (epoch + 1) % 50 == 0:
            # Evaluate
            train_loss = eval_loss(params, X_train, y_train)
            test_loss = eval_loss(params, X_test, y_test)
           
            train_preds = predict(params, X_train)
            test_preds = predict(params, X_test)
            train_acc = jnp.mean(train_preds == y_train)
            test_acc = jnp.mean(test_preds == y_test)
           
            print(f"Epoch {epoch+1}/{epochs}")
            print(f"  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
            print(f"  Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")
            print(f"  Accuracy Gap: {train_acc - test_acc:.4f}")
            
            # Early stopping check
            if test_loss < best_test_loss:
                best_test_loss = test_loss
                best_params = params
                patience_counter = 0
            else:
                patience_counter += 50
                
            if patience_counter >= patience:
                print(f"\nEarly stopping at epoch {epoch+1}")
                print(f"Best test loss: {best_test_loss:.4f}")
                break
   
    return best_params

if __name__ == "__main__":
    print("="*70)
    print("ADDRESSING OVERFITTING: COMPARING DIFFERENT REGULARIZATION STRATEGIES")
    print("="*70)
    
    # Load data
    print("\nLoading data...")
    X, y = load_data("datasets/AI_Impact_on_Jobs_2030.csv")
    print(f"Dataset: {X.shape[0]} samples, {X.shape[1]} features")
    print(f"\nProblem: Original model had 95% train accuracy but only 76% test accuracy")
    print("This 19% gap indicates OVERFITTING - the model memorized training data\n")
    
    print("\n" + "="*70)
    print("EXPERIMENT 1: Original Architecture (Baseline)")
    print("="*70)
    print("Strategy: No regularization, deeper network [38, 24, 12, 1]")
    params1 = train(X, y, layer_sizes=[38, 24, 12, 1], epochs=500, lr=0.01, 
                    l2_reg=0.0, patience=150)
    
    print("\n" + "="*70)
    print("EXPERIMENT 2: Simpler Architecture")
    print("="*70)
    print("Strategy: Reduce model complexity [38, 16, 1]")
    params2 = train(X, y, layer_sizes=[38, 16, 1], epochs=500, lr=0.01, 
                    l2_reg=0.0, patience=150)
    
    print("\n" + "="*70)
    print("EXPERIMENT 3: L2 Regularization (Weak)")
    print("="*70)
    print("Strategy: Add weak L2 penalty (0.001) to prevent large weights")
    params3 = train(X, y, layer_sizes=[38, 16, 1], epochs=500, lr=0.01, 
                    l2_reg=0.001, patience=150)
    
    print("\n" + "="*70)
    print("EXPERIMENT 4: L2 Regularization (Strong)")
    print("="*70)
    print("Strategy: Add stronger L2 penalty (0.01) to enforce weight decay")
    params4 = train(X, y, layer_sizes=[38, 16, 1], epochs=500, lr=0.01, 
                    l2_reg=0.01, patience=150)
    
    print("\n" + "="*70)
    print("SUMMARY & RECOMMENDATIONS")
    print("="*70)
 
======================================================================
ADDRESSING OVERFITTING: COMPARING DIFFERENT REGULARIZATION STRATEGIES
======================================================================

Loading data...
Dataset: 3000 samples, 38 features

Problem: Original model had 95% train accuracy but only 76% test accuracy
This 19% gap indicates OVERFITTING - the model memorized training data


======================================================================
EXPERIMENT 1: Original Architecture (Baseline)
======================================================================
Strategy: No regularization, deeper network [38, 24, 12, 1]

Training Configuration:
  Network: [38, 24, 12, 1]
  L2 Regularization: 0.0
  Learning Rate: 0.01
  Batch Size: 64
  Early Stopping Patience: 150

Epoch 50/500
  Train Loss: 0.3419, Train Acc: 0.8100
  Test Loss: 0.3874, Test Acc: 0.7583
  Accuracy Gap: 0.0517
Epoch 100/500
  Train Loss: 0.3138, Train Acc: 0.8408
  Test Loss: 0.3822, Test Acc: 0.7600
  Accuracy Gap: 0.0808
Epoch 150/500
  Train Loss: 0.2921, Train Acc: 0.8600
  Test Loss: 0.3913, Test Acc: 0.7467
  Accuracy Gap: 0.1133
Epoch 200/500
  Train Loss: 0.2704, Train Acc: 0.8804
  Test Loss: 0.4129, Test Acc: 0.7383
  Accuracy Gap: 0.1421
Epoch 250/500
  Train Loss: 0.2487, Train Acc: 0.8892
  Test Loss: 0.4417, Test Acc: 0.7450
  Accuracy Gap: 0.1442

Early stopping at epoch 250
Best test loss: 0.3822

======================================================================
EXPERIMENT 2: Simpler Architecture
======================================================================
Strategy: Reduce model complexity [38, 16, 1]

Training Configuration:
  Network: [38, 16, 1]
  L2 Regularization: 0.0
  Learning Rate: 0.01
  Batch Size: 64
  Early Stopping Patience: 150

Epoch 50/500
  Train Loss: 0.3571, Train Acc: 0.7846
  Test Loss: 0.3850, Test Acc: 0.7500
  Accuracy Gap: 0.0346
Epoch 100/500
  Train Loss: 0.3372, Train Acc: 0.7983
  Test Loss: 0.3755, Test Acc: 0.7550
  Accuracy Gap: 0.0433
Epoch 150/500
  Train Loss: 0.3259, Train Acc: 0.8175
  Test Loss: 0.3745, Test Acc: 0.7533
  Accuracy Gap: 0.0642
Epoch 200/500
  Train Loss: 0.3171, Train Acc: 0.8233
  Test Loss: 0.3759, Test Acc: 0.7483
  Accuracy Gap: 0.0750
Epoch 250/500
  Train Loss: 0.3089, Train Acc: 0.8354
  Test Loss: 0.3785, Test Acc: 0.7450
  Accuracy Gap: 0.0904
Epoch 300/500
  Train Loss: 0.3006, Train Acc: 0.8471
  Test Loss: 0.3807, Test Acc: 0.7450
  Accuracy Gap: 0.1021

Early stopping at epoch 300
Best test loss: 0.3745

======================================================================
EXPERIMENT 3: L2 Regularization (Weak)
======================================================================
Strategy: Add weak L2 penalty (0.001) to prevent large weights

Training Configuration:
  Network: [38, 16, 1]
  L2 Regularization: 0.001
  Learning Rate: 0.01
  Batch Size: 64
  Early Stopping Patience: 150

Epoch 50/500
  Train Loss: 0.3578, Train Acc: 0.7846
  Test Loss: 0.3858, Test Acc: 0.7483
  Accuracy Gap: 0.0362
Epoch 100/500
  Train Loss: 0.3387, Train Acc: 0.7975
  Test Loss: 0.3761, Test Acc: 0.7533
  Accuracy Gap: 0.0442
Epoch 150/500
  Train Loss: 0.3289, Train Acc: 0.8179
  Test Loss: 0.3744, Test Acc: 0.7567
  Accuracy Gap: 0.0613
Epoch 200/500
  Train Loss: 0.3217, Train Acc: 0.8296
  Test Loss: 0.3750, Test Acc: 0.7517
  Accuracy Gap: 0.0779
Epoch 250/500
  Train Loss: 0.3155, Train Acc: 0.8354
  Test Loss: 0.3765, Test Acc: 0.7500
  Accuracy Gap: 0.0854
Epoch 300/500
  Train Loss: 0.3095, Train Acc: 0.8421
  Test Loss: 0.3774, Test Acc: 0.7483
  Accuracy Gap: 0.0938

Early stopping at epoch 300
Best test loss: 0.3744

======================================================================
EXPERIMENT 4: L2 Regularization (Strong)
======================================================================
Strategy: Add stronger L2 penalty (0.01) to enforce weight decay

Training Configuration:
  Network: [38, 16, 1]
  L2 Regularization: 0.01
  Learning Rate: 0.01
  Batch Size: 64
  Early Stopping Patience: 150

Epoch 50/500
  Train Loss: 0.3667, Train Acc: 0.7846
  Test Loss: 0.3950, Test Acc: 0.7500
  Accuracy Gap: 0.0346
Epoch 100/500
  Train Loss: 0.3565, Train Acc: 0.7912
  Test Loss: 0.3878, Test Acc: 0.7483
  Accuracy Gap: 0.0429
Epoch 150/500
  Train Loss: 0.3558, Train Acc: 0.7875
  Test Loss: 0.3869, Test Acc: 0.7433
  Accuracy Gap: 0.0442
Epoch 200/500
  Train Loss: 0.3563, Train Acc: 0.7846
  Test Loss: 0.3870, Test Acc: 0.7483
  Accuracy Gap: 0.0362
Epoch 250/500
  Train Loss: 0.3567, Train Acc: 0.7867
  Test Loss: 0.3871, Test Acc: 0.7383
  Accuracy Gap: 0.0483
Epoch 300/500
  Train Loss: 0.3570, Train Acc: 0.7837
  Test Loss: 0.3867, Test Acc: 0.7400
  Accuracy Gap: 0.0437
Epoch 350/500
  Train Loss: 0.3571, Train Acc: 0.7842
  Test Loss: 0.3871, Test Acc: 0.7400
  Accuracy Gap: 0.0442
Epoch 400/500
  Train Loss: 0.3571, Train Acc: 0.7837
  Test Loss: 0.3870, Test Acc: 0.7417
  Accuracy Gap: 0.0421
Epoch 450/500
  Train Loss: 0.3571, Train Acc: 0.7854
  Test Loss: 0.3868, Test Acc: 0.7300
  Accuracy Gap: 0.0554

Early stopping at epoch 450
Best test loss: 0.3867

======================================================================
SUMMARY & RECOMMENDATIONS
======================================================================

Overfitting Causes Identified:

  1. Model complexity: 3 hidden layers with many parameters
  2. Small dataset: Only ~63 samples per feature
  3. No regularization: Model free to memorize patterns
  4. Training too long: 500 epochs without early stopping

Best Practices to Reduce Overfitting:

  1. ✓ Simplify architecture: Use fewer/smaller hidden layers
  2. ✓ Add L2 regularization: Penalize large weights (try 0.001-0.01)
  3. ✓ Use early stopping: Stop when test loss stops improving
  4. Consider dropout: Randomly disable neurons during training (see the sketch at the end of this answer)
  5. Get more data: Increase training samples if possible
  6. Feature selection: Remove irrelevant features

Expected Results:

  • Simpler models: Smaller train-test gap (5-10% instead of 19%)
  • L2 regularization: Test accuracy should improve by 2-5%
  • Early stopping: Prevents overfitting in later epochs

The key insight: your 95% training accuracy was actually a warning sign, not a success! A simpler model that gets 82% on both train and test is much better than one that gets 95% train and 76% test.
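
Item 4 in the best-practices list above mentions dropout, which none of the experiments actually implement. Here is a minimal sketch of how inverted dropout could be added to the existing JAX forward pass (my code; the function names and the 0.2 rate are assumptions, not part of the original):

In [ ]:
import jax
import jax.numpy as jnp

def dropout(key, x, rate=0.2, training=True):
    # Inverted dropout: zero a fraction `rate` of activations and rescale the
    # survivors by 1/keep_prob so the expected activation stays unchanged.
    if not training or rate == 0.0:
        return x
    keep_prob = 1.0 - rate
    mask = jax.random.bernoulli(key, keep_prob, x.shape)
    return jnp.where(mask, x / keep_prob, 0.0)

def forward_with_dropout(params, x, key, rate=0.2, training=True):
    # Same structure as forward() above, with dropout after each hidden tanh layer
    for (w, b) in params[:-1]:
        key, subkey = jax.random.split(key)
        x = jnp.tanh(jnp.dot(x, w) + b)
        x = dropout(subkey, x, rate, training)
    w, b = params[-1]
    return jnp.dot(x, w) + b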

Conclusion: test accuracy is around 73%. I still see an issue with replacing a probability by a yes/no output.¶

Experiment 5 - it is a regression problem, not a classification problem !¶

Me: "AI, in this code (the one provided initialy, without the two features added later), the loss function uses binary cross-entropy, but Automation_Probability_2030 in the CSV is likely a continuous value (0.0 to 1.0), What loss function should i use instead ?
AI: "If Automation_Probability_2030 is a continuous value between 0.0 and 1.0 (which makes sense for a probability), this is a regression problem, not a binary classification problem. Here are the changes you should make:

  • Use Mean Squared Error (MSE) loss instead
  • Modify the output layer to use a sigmoid activation
  • Update the prediction function for regression
  • Change accuracy metrics to regression metrics: replace accuracy with Mean Absolute Error (MAE) or R² score (see the sketch below)"

Alternative: If you prefer treating this as classification (high/low automation risk), you could threshold the continuous values at 0.5 and keep binary cross-entropy, but MSE is more appropriate for continuous targets.
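
The last bullet above suggests MAE or R²; the regression cell below only reports MAE, so here is a small R² helper as a sketch (my addition; predict, params, X_test and y_test refer to names defined in the next cell):

In [ ]:
import jax.numpy as jnp

def r2_score(y_true, y_pred):
    # R² = 1 - (residual sum of squares) / (total sum of squares)
    ss_res = jnp.sum((y_true - y_pred) ** 2)
    ss_tot = jnp.sum((y_true - jnp.mean(y_true)) ** 2)
    return 1.0 - ss_res / ss_tot

# Example usage after training:
# print(f"Test R2: {r2_score(y_test, predict(params, X_test)):.3f}")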

Here is the code (note: only the numerical features are used in this example)¶

In [7]:
import jax
import jax.numpy as jnp
from jax import grad, jit
import csv

# Load and preprocess data
# [Philippe] Still the same question here: why do we get rid of the non-numerical features ?? 

def load_data(filename, max_rows=5000):
    with open(filename, 'r') as f:
        reader = csv.DictReader(f)
        rows = list(reader)[:max_rows]  # Limit data for memory
   
    # Select numerical features
    feature_names = ['Average_Salary', 'Years_Experience', 
                     'AI_Exposure_Index', 'Tech_Growth_Factor', 'Skill_1',
                     'Skill_2', 'Skill_3', 'Skill_4',
                     'Skill_5', 'Skill_6', 'Skill_7',
                     'Skill_8', 'Skill_9', 'Skill_10']
   
    X = []
    y = []
    for row in rows:
        features = [float(row[name]) for name in feature_names]
        X.append(features)
        y.append(float(row['Automation_Probability_2030']))
   
    return jnp.array(X), jnp.array(y)

# Normalize features
def normalize(X):
    mean = jnp.mean(X, axis=0)
    std = jnp.std(X, axis=0) + 1e-8
    return (X - mean) / std

# Initialize network parameters
# [Philippe] In this code (when called), we will have 4 layers: an input layer (14 features => 14 neurons), a hidden layer (15 neurons - why one more than the number of inputs?), another hidden layer (8 neurons) and an output layer with 1 neuron

def init_network(layer_sizes, key):
    params = []
    for i in range(len(layer_sizes) - 1):
        key, subkey = jax.random.split(key)
        w = jax.random.normal(subkey, (layer_sizes[i], layer_sizes[i+1])) * 0.1
        b = jnp.zeros(layer_sizes[i+1])
        params.append((w, b))
    return params

# Forward pass with sigmoid on output
def forward(params, x):
    for i, (w, b) in enumerate(params[:-1]):
        x = jnp.tanh(jnp.dot(x, w) + b)
    w, b = params[-1]
    logits = jnp.dot(x, w) + b
    return sigmoid(logits)  # Add sigmoid to bound output to [0, 1]

# Sigmoid activation
def sigmoid(x):
    return 1 / (1 + jnp.exp(-x))

# MSE loss for regression
def loss_fn(params, x, y):
    preds = forward(params, x).squeeze()
    return jnp.mean((preds - y) ** 2)

# Prediction function (no thresholding needed for regression)
def predict(params, x):
    return forward(params, x).squeeze()

# Training step
@jit
def train_step(params, x, y, lr):
    loss, grads = jax.value_and_grad(loss_fn)(params, x, y)
    params = [(w - lr * dw, b - lr * db) for (w, b), (dw, db) in zip(params, grads)]
    return params, loss

# Main training loop
def train(X, y, layer_sizes=[14, 15, 8, 1], epochs=500, lr=0.01, batch_size=64):
    key = jax.random.PRNGKey(42)
   
    # Normalize data
    X_norm = normalize(X)
   
    # Split data (80/20)
    n_train = int(0.8 * len(X))
    X_train, X_test = X_norm[:n_train], X_norm[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]
   
    # Initialize network
    params = init_network(layer_sizes, key)
   
    # Training loop
    n_batches = len(X_train) // batch_size
   
    for epoch in range(epochs):
        # Shuffle data
        key, subkey = jax.random.split(key)
        perm = jax.random.permutation(subkey, len(X_train))
        X_train_shuffled = X_train[perm]
        y_train_shuffled = y_train[perm]
       
        for i in range(n_batches):
            start = i * batch_size
            end = start + batch_size
            X_batch = X_train_shuffled[start:end]
            y_batch = y_train_shuffled[start:end]
           
            params, batch_loss = train_step(params, X_batch, y_batch, lr)

        if (epoch + 1) % 50 == 0:
            train_loss = loss_fn(params, X_train, y_train)
            test_loss = loss_fn(params, X_test, y_test)
            
            train_preds = predict(params, X_train)
            test_preds = predict(params, X_test)
            train_mae = jnp.mean(jnp.abs(train_preds - y_train))
            test_mae = jnp.mean(jnp.abs(test_preds - y_test))
    
            print(f"Epoch {epoch+1}/{epochs}")
            print(f"  Train Loss (MSE): {train_loss:.4f}, Train MAE: {train_mae:.4f}")
            print(f"  Test Loss (MSE): {test_loss:.4f}, Test MAE: {test_mae:.4f}")
          
    return params

if __name__ == "__main__":
    # Load data
    print("Loading data...")
    X, y = load_data("datasets/AI_Impact_on_Jobs_2030.csv")
    print(f"Dataset: {X.shape[0]} samples, {X.shape[1]} features")
   
    # Train model
    print("\nTraining neural network...")
    params = train(X, y)
   
    print("\nTraining complete!")
    print(y)
Loading data...
Dataset: 3000 samples, 14 features

Training neural network...
Epoch 50/500
  Train Loss (MSE): 0.0625, Train MAE: 0.2120
  Test Loss (MSE): 0.0571, Test MAE: 0.2013
Epoch 100/500
  Train Loss (MSE): 0.0624, Train MAE: 0.2119
  Test Loss (MSE): 0.0572, Test MAE: 0.2015
Epoch 150/500
  Train Loss (MSE): 0.0624, Train MAE: 0.2117
  Test Loss (MSE): 0.0573, Test MAE: 0.2017
Epoch 200/500
  Train Loss (MSE): 0.0623, Train MAE: 0.2116
  Test Loss (MSE): 0.0574, Test MAE: 0.2019
Epoch 250/500
  Train Loss (MSE): 0.0623, Train MAE: 0.2115
  Test Loss (MSE): 0.0575, Test MAE: 0.2020
Epoch 300/500
  Train Loss (MSE): 0.0622, Train MAE: 0.2114
  Test Loss (MSE): 0.0576, Test MAE: 0.2022
Epoch 350/500
  Train Loss (MSE): 0.0622, Train MAE: 0.2113
  Test Loss (MSE): 0.0576, Test MAE: 0.2024
Epoch 400/500
  Train Loss (MSE): 0.0622, Train MAE: 0.2113
  Test Loss (MSE): 0.0577, Test MAE: 0.2025
Epoch 450/500
  Train Loss (MSE): 0.0622, Train MAE: 0.2112
  Test Loss (MSE): 0.0578, Test MAE: 0.2027
Epoch 500/500
  Train Loss (MSE): 0.0621, Train MAE: 0.2112
  Test Loss (MSE): 0.0579, Test MAE: 0.2028

Training complete!
[0.85 0.05 0.81 ... 0.39 0.46 0.49]

Conclusion: the Mean Absolute Error is around 0.20.¶

The MAE value itself indicates the average absolute error between predicted and actual values. The smaller the MAE, the better the model's predictions align with the actual data. An MAE of 0 would mean a perfect prediction, but in most cases achieving such perfection is unlikely. It's important to compare the MAE to the scale of the target variable: here the target ranges from 0 to 1, so an MAE of 0.20 means the predictions are off by about 20 percentage points on average, which is a large error.
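
To judge how large, a useful yardstick (my sketch, not in the original notebook) is the MAE of always predicting the training-set mean, using the X, y arrays returned by load_data() above:

In [ ]:
import numpy as np

y_np = np.asarray(y)
n_train = int(0.8 * len(y_np))
y_train, y_test = y_np[:n_train], y_np[n_train:]

baseline_pred = y_train.mean()                         # constant prediction
baseline_mae = np.mean(np.abs(y_test - baseline_pred))
print(f"Baseline MAE (always predict the mean): {baseline_mae:.4f}")
# If the network's test MAE (~0.20) is close to this baseline, the model has
# learned little beyond the average automation probability.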

Experiment 6 - Using scikit-learn's MLPRegressor¶

Here is the code (similar to the code sample provided in class, but using a regressor)¶

In [15]:
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import numpy as np
import csv

# Define categorical mappings
JOB_TITLES = ['AI Engineer', 'Chef', 'Construction Worker', 'Customer Support', 
              'Data Scientist', 'Doctor', 'Financial Analyst', 'Graphic Designer', 
              'HR Specialist', 'Lawyer', 'Marketing Manager', 'Mechanic', 
              'Nurse', 'Research Scientist', 'Retail Worker', 'Security Guard', 
              'Software Engineer', 'Teacher', 'Truck Driver', 'UX Researcher']

EDUCATION_LEVELS = ["Bachelor's", 'High School', "Master's", 'PhD']

# Load and preprocess data
def load_data(filename, max_rows=5000, threshold=0.5):
    with open(filename, 'r') as f:
        reader = csv.DictReader(f)
        rows = list(reader)[:max_rows]  # Limit data for memory
   
    # Select numerical features
    numerical_features = ['Average_Salary', 'Years_Experience', 
                         'AI_Exposure_Index', 'Tech_Growth_Factor', 'Skill_1',
                         'Skill_2', 'Skill_3', 'Skill_4',
                         'Skill_5', 'Skill_6', 'Skill_7',
                         'Skill_8', 'Skill_9', 'Skill_10']
   
    X = []
    y = []
    
    for row in rows:
        # Extract numerical features
        features = [float(row[name]) for name in numerical_features]        
        
        # One-hot encode Job_Title.   
        job_title = row['Job_Title']
        job_encoding = [1.0 if job_title == jt else 0.0 for jt in JOB_TITLES]
        
        # One-hot encode Education_Level
        education = row['Education_Level']
        edu_encoding = [1.0 if education == el else 0.0 for el in EDUCATION_LEVELS]
        
        # Combine all features
        all_features = features + job_encoding + edu_encoding
        X.append(all_features)              
       
        y.append(float(row['Automation_Probability_2030']))
   
    return np.array(X), np.array(y)

# Normalize features
def normalize(X):
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0) + 1e-8
    return (X - mean) / std


if __name__ == "__main__":
    # Load data
    print("Loading data...")
    X, y = load_data("datasets/AI_Impact_on_Jobs_2030.csv")
    print(f"Dataset: {X.shape[0]} samples, {X.shape[1]} features")

    # Normalize data
    X_norm = normalize(X)
   
    # Split data (80/20)
    n_train = int(0.8 * len(X))
    xtrain, xtest = X_norm[:n_train], X_norm[n_train:]
    ytrain, ytest = y[:n_train], y[n_train:]
        
    print(f"read {xtrain.shape[1]} byte data records, {xtrain.shape[0]} training examples, {xtest.shape[0]} testing examples\n")

    regr = MLPRegressor(random_state=1, max_iter=2000, tol=0.1)
    regr.fit(xtrain, ytrain)
    predictions = regr.predict(xtest[:20])
    print(predictions)
    print(ytest[:20])
    
    print(f"\ntest score: {regr.score(xtest,ytest)}\n")
    
    
Loading data...
Dataset: 3000 samples, 38 features
read records with 38 features, 2400 training examples, 600 testing examples

[0.58870038 0.34096235 1.04184036 0.35484229 0.14032916 0.53031915
 0.56390841 0.45442037 0.1585959  0.18552326 0.5477567  0.30784507
 0.62668054 0.49633547 0.70257281 0.36889893 0.07325958 1.01327444
 0.8125246  0.73722747]
[0.7  0.5  0.72 0.37 0.42 0.4  0.52 0.43 0.22 0.22 0.48 0.4  0.52 0.35
 0.34 0.74 0.69 0.74 0.36 0.88]

test score: 0.3363055580276779

Conclusion: the test R² score is only about 0.34...¶
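
One possible reason for the low score (my observation, not the AI's): tol=0.1 is a very loose stopping tolerance for MLPRegressor, so training probably stops after only a few iterations. Here is a hedged sketch of a tighter configuration, reusing xtrain/xtest/ytrain/ytest from the cell above; the hyperparameter values are assumptions, not tuned results:

In [ ]:
from sklearn.neural_network import MLPRegressor

regr = MLPRegressor(hidden_layer_sizes=(32, 16),  # explicit, modest hidden layers
                    max_iter=2000,
                    tol=1e-4,                     # much tighter tolerance (sklearn default)
                    early_stopping=True,          # hold out part of the training set
                    random_state=1)
regr.fit(xtrain, ytrain)
print(f"test R^2 score: {regr.score(xtest, ytest):.3f}")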

In [ ]: