Week 4: machine learning - "AI impact on jobs by 2030" dataset¶
Context¶
- Source: Kaggle
- Description: this dataset simulates the future of work in the age of artificial intelligence. It models how various professions, skills, and education levels might be impacted by AI-driven automation by the year 2030.
Load dataset¶
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
df = pd.read_csv("datasets/AI_Impact_on_Jobs_2030.csv")
# 🧾 Display dataset information
print("Dataset shape:", df.shape)
#print(df.info())
Dataset shape: (3000, 18)
Explore content¶
df.head()
| | Job_Title | Average_Salary | Years_Experience | Education_Level | AI_Exposure_Index | Tech_Growth_Factor | Automation_Probability_2030 | Risk_Category | Skill_1 | Skill_2 | Skill_3 | Skill_4 | Skill_5 | Skill_6 | Skill_7 | Skill_8 | Skill_9 | Skill_10 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Security Guard | 45795 | 28 | Master's | 0.18 | 1.28 | 0.85 | High | 0.45 | 0.10 | 0.46 | 0.33 | 0.14 | 0.65 | 0.06 | 0.72 | 0.94 | 0.00 |
| 1 | Research Scientist | 133355 | 20 | PhD | 0.62 | 1.11 | 0.05 | Low | 0.02 | 0.52 | 0.40 | 0.05 | 0.97 | 0.23 | 0.09 | 0.62 | 0.38 | 0.98 |
| 2 | Construction Worker | 146216 | 2 | High School | 0.86 | 1.18 | 0.81 | High | 0.01 | 0.94 | 0.56 | 0.39 | 0.02 | 0.23 | 0.24 | 0.68 | 0.61 | 0.83 |
| 3 | Software Engineer | 136530 | 13 | PhD | 0.39 | 0.68 | 0.60 | Medium | 0.43 | 0.21 | 0.57 | 0.03 | 0.84 | 0.45 | 0.40 | 0.93 | 0.73 | 0.33 |
| 4 | Financial Analyst | 70397 | 22 | High School | 0.52 | 1.46 | 0.64 | Medium | 0.75 | 0.54 | 0.59 | 0.97 | 0.61 | 0.28 | 0.30 | 0.17 | 0.02 | 0.42 |
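Before jumping into the models, a quick sanity check on the target and the categorical columns is useful (a minimal sketch, assuming the column names shown in df.head() above):
# Quick sanity checks: target distribution and cardinality of the categorical columns
print(df["Automation_Probability_2030"].describe())
print(df["Risk_Category"].value_counts())
print(df["Job_Title"].nunique(), "distinct job titles,",
      df["Education_Level"].nunique(), "distinct education levels")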
Experiment 1 - let's try to adapt, by myself, the code generated by Claude AI for the bank loan dataset...¶
Here is the code¶
import jax
import jax.numpy as jnp
from jax import grad, jit
import csv
# Load and preprocess data
# [Philippe] Still the same question here: why do we get rid of the non-numerical features ??
def load_data(filename, max_rows=5000):
with open(filename, 'r') as f:
reader = csv.DictReader(f)
rows = list(reader)[:max_rows] # Limit data for memory
# Select numerical features
feature_names = ['Average_Salary', 'Years_Experience',
'AI_Exposure_Index', 'Tech_Growth_Factor', 'Skill_1',
'Skill_2', 'Skill_3', 'Skill_4',
'Skill_5', 'Skill_6', 'Skill_7',
'Skill_8', 'Skill_9', 'Skill_10']
X = []
y = []
for row in rows:
features = [float(row[name]) for name in feature_names]
X.append(features)
y.append(float(row['Automation_Probability_2030']))
return jnp.array(X), jnp.array(y)
# Normalize features
def normalize(X):
mean = jnp.mean(X, axis=0)
std = jnp.std(X, axis=0) + 1e-8
return (X - mean) / std
# Initialize network parameters
# [Philippe] In this code (when called), we will have 4 layers: an input layer (14 features => 14 neurons), a hidden layer with 15 neurons (why one more than the inputs??), another hidden layer with 8 neurons, and an output layer with 1 neuron
def init_network(layer_sizes, key):
params = []
for i in range(len(layer_sizes) - 1):
key, subkey = jax.random.split(key)
w = jax.random.normal(subkey, (layer_sizes[i], layer_sizes[i+1])) * 0.1
b = jnp.zeros(layer_sizes[i+1])
params.append((w, b))
return params
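# [Note] For layer_sizes=[14, 15, 8, 1] (the default used below), init_network returns
# three (w, b) pairs with shapes (14, 15), (15, 8) and (8, 1): two hidden layers of
# 15 and 8 neurons plus a single output neuron.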
# Forward pass
# [Philippe]
def forward(params, x):
for i, (w, b) in enumerate(params[:-1]):
x = jnp.tanh(jnp.dot(x, w) + b)
w, b = params[-1]
return jnp.dot(x, w) + b
# Sigmoid activation
def sigmoid(x):
return 1 / (1 + jnp.exp(-x))
# Binary cross-entropy loss
def loss_fn(params, x, y):
logits = forward(params, x)
probs = sigmoid(logits.squeeze())
return -jnp.mean(y * jnp.log(probs + 1e-8) + (1 - y) * jnp.log(1 - probs + 1e-8))
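# [Note] Binary cross-entropy assumes y contains 0/1 labels; here y is the raw
# Automation_Probability_2030 column, a continuous value in [0, 1], which is
# exactly what the discussion after the results comes back to.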
# Prediction function
def predict(params, x):
logits = forward(params, x)
return (sigmoid(logits.squeeze()) > 0.5).astype(jnp.float32)
# Training step
@jit
def train_step(params, x, y, lr):
loss, grads = jax.value_and_grad(loss_fn)(params, x, y)
params = [(w - lr * dw, b - lr * db) for (w, b), (dw, db) in zip(params, grads)]
return params, loss
# Main training loop
def train(X, y, layer_sizes=[14, 15, 8, 1], epochs=500, lr=0.01, batch_size=64):
key = jax.random.PRNGKey(42)
# Normalize data
X_norm = normalize(X)
# Split data (80/20)
n_train = int(0.8 * len(X))
X_train, X_test = X_norm[:n_train], X_norm[n_train:]
y_train, y_test = y[:n_train], y[n_train:]
# Initialize network
params = init_network(layer_sizes, key)
# Training loop
n_batches = len(X_train) // batch_size
for epoch in range(epochs):
# Shuffle data
key, subkey = jax.random.split(key)
perm = jax.random.permutation(subkey, len(X_train))
X_train_shuffled = X_train[perm]
y_train_shuffled = y_train[perm]
for i in range(n_batches):
start = i * batch_size
end = start + batch_size
X_batch = X_train_shuffled[start:end]
y_batch = y_train_shuffled[start:end]
params, batch_loss = train_step(params, X_batch, y_batch, lr)
if (epoch + 1) % 50 == 0:
train_loss = loss_fn(params, X_train, y_train)
test_loss = loss_fn(params, X_test, y_test)
train_preds = predict(params, X_train)
test_preds = predict(params, X_test)
train_acc = jnp.mean(train_preds == y_train)
test_acc = jnp.mean(test_preds == y_test)
print(f"Epoch {epoch+1}/{epochs}")
print(f" Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
print(f" Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")
return params
if __name__ == "__main__":
# Load data
print("Loading data...")
X, y = load_data("datasets/AI_Impact_on_Jobs_2030.csv")
print(f"Dataset: {X.shape[0]} samples, {X.shape[1]} features")
# Train model
print("\nTraining neural network...")
params = train(X, y)
print("\nTraining complete!")
Loading data...
Dataset: 3000 samples, 14 features

Training neural network...
Epoch 50/500
  Train Loss: 0.6930, Train Acc: 0.0000
  Test Loss: 0.6937, Test Acc: 0.0000
Epoch 100/500
  Train Loss: 0.6927, Train Acc: 0.0000
  Test Loss: 0.6940, Test Acc: 0.0000
Epoch 150/500
  Train Loss: 0.6925, Train Acc: 0.0000
  Test Loss: 0.6944, Test Acc: 0.0000
Epoch 200/500
  Train Loss: 0.6924, Train Acc: 0.0000
  Test Loss: 0.6947, Test Acc: 0.0000
Epoch 250/500
  Train Loss: 0.6923, Train Acc: 0.0000
  Test Loss: 0.6950, Test Acc: 0.0000
Epoch 300/500
  Train Loss: 0.6922, Train Acc: 0.0000
  Test Loss: 0.6952, Test Acc: 0.0000
Epoch 350/500
  Train Loss: 0.6922, Train Acc: 0.0000
  Test Loss: 0.6954, Test Acc: 0.0000
Epoch 400/500
  Train Loss: 0.6921, Train Acc: 0.0000
  Test Loss: 0.6955, Test Acc: 0.0000
Epoch 450/500
  Train Loss: 0.6921, Train Acc: 0.0000
  Test Loss: 0.6956, Test Acc: 0.0000
Epoch 500/500
  Train Loss: 0.6921, Train Acc: 0.0000
  Test Loss: 0.6956, Test Acc: 0.0000

Training complete!
Conclusion: it runs but the results are disappointing...¶
Me: "So, AI, what's wrong with my code ?"
AI: "Missing data binarization: Your loss function uses binary cross-entropy, but Automation_Probability_2030 in your CSV is likely a continuous value (0.0 to 1.0), not binary (0 or 1). You need to convert it to binary labels"
Me: "What??? The output in the dataset is a probability. You just ignore it by replacing it by a binary value ! Anyway, let's have a look on your suggestion:
Experiment 2 - switching from probability to boolean¶
Here is the code¶
import jax
import jax.numpy as jnp
from jax import grad, jit
import csv
# Load and preprocess data
def load_data(filename, max_rows=5000, threshold=0.5):
with open(filename, 'r') as f:
reader = csv.DictReader(f)
rows = list(reader)[:max_rows] # Limit data for memory
# Select numerical features
feature_names = ['Average_Salary', 'Years_Experience',
'AI_Exposure_Index', 'Tech_Growth_Factor', 'Skill_1',
'Skill_2', 'Skill_3', 'Skill_4',
'Skill_5', 'Skill_6', 'Skill_7',
'Skill_8', 'Skill_9', 'Skill_10']
X = []
y = []
for row in rows:
features = [float(row[name]) for name in feature_names]
X.append(features)
# Convert probability to binary label ====================== this is the new code
prob = float(row['Automation_Probability_2030'])
y.append(1.0 if prob >= threshold else 0.0)
return jnp.array(X), jnp.array(y)
# Normalize features
def normalize(X):
mean = jnp.mean(X, axis=0)
std = jnp.std(X, axis=0) + 1e-8
return (X - mean) / std
# Initialize network parameters
def init_network(layer_sizes, key):
params = []
for i in range(len(layer_sizes) - 1):
key, subkey = jax.random.split(key)
w = jax.random.normal(subkey, (layer_sizes[i], layer_sizes[i+1])) * 0.1
b = jnp.zeros(layer_sizes[i+1])
params.append((w, b))
return params
# Forward pass
def forward(params, x):
for i, (w, b) in enumerate(params[:-1]):
x = jnp.tanh(jnp.dot(x, w) + b)
w, b = params[-1]
return jnp.dot(x, w) + b
# Sigmoid activation
def sigmoid(x):
return 1 / (1 + jnp.exp(-x))
# Binary cross-entropy loss
def loss_fn(params, x, y):
logits = forward(params, x)
probs = sigmoid(logits.squeeze())
return -jnp.mean(y * jnp.log(probs + 1e-8) + (1 - y) * jnp.log(1 - probs + 1e-8))
# Prediction function
def predict(params, x):
logits = forward(params, x)
return (sigmoid(logits.squeeze()) > 0.5).astype(jnp.float32)
# Training step
@jit
def train_step(params, x, y, lr):
loss, grads = jax.value_and_grad(loss_fn)(params, x, y)
params = [(w - lr * dw, b - lr * db) for (w, b), (dw, db) in zip(params, grads)]
return params, loss
# Main training loop
def train(X, y, layer_sizes=[14, 8, 1], epochs=500, lr=0.01, batch_size=64):
key = jax.random.PRNGKey(42)
# Normalize data
X_norm = normalize(X)
# Split data (80/20)
n_train = int(0.8 * len(X))
X_train, X_test = X_norm[:n_train], X_norm[n_train:]
y_train, y_test = y[:n_train], y[n_train:]
# Initialize network
params = init_network(layer_sizes, key)
# Training loop
n_batches = len(X_train) // batch_size
for epoch in range(epochs):
# Shuffle data
key, subkey = jax.random.split(key)
perm = jax.random.permutation(subkey, len(X_train))
X_train_shuffled = X_train[perm]
y_train_shuffled = y_train[perm]
for i in range(n_batches):
start = i * batch_size
end = start + batch_size
X_batch = X_train_shuffled[start:end]
y_batch = y_train_shuffled[start:end]
params, batch_loss = train_step(params, X_batch, y_batch, lr)
if (epoch + 1) % 50 == 0:
train_loss = loss_fn(params, X_train, y_train)
test_loss = loss_fn(params, X_test, y_test)
train_preds = predict(params, X_train)
test_preds = predict(params, X_test)
train_acc = jnp.mean(train_preds == y_train)
test_acc = jnp.mean(test_preds == y_test)
print(f"Epoch {epoch+1}/{epochs}")
print(f" Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
print(f" Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")
return params
if __name__ == "__main__":
# Load data
print("Loading data...")
X, y = load_data("datasets/AI_Impact_on_Jobs_2030.csv")
print(f"Dataset: {X.shape[0]} samples, {X.shape[1]} features")
# Train model
print("\nTraining neural network...")
params = train(X, y)
print("\nTraining complete!")
Loading data...
Dataset: 3000 samples, 14 features

Training neural network...
Epoch 50/500
  Train Loss: 0.6914, Train Acc: 0.5296
  Test Loss: 0.6963, Test Acc: 0.4833
Epoch 100/500
  Train Loss: 0.6902, Train Acc: 0.5258
  Test Loss: 0.6985, Test Acc: 0.4850
Epoch 150/500
  Train Loss: 0.6896, Train Acc: 0.5300
  Test Loss: 0.7002, Test Acc: 0.4817
Epoch 200/500
  Train Loss: 0.6893, Train Acc: 0.5358
  Test Loss: 0.7015, Test Acc: 0.4750
Epoch 250/500
  Train Loss: 0.6892, Train Acc: 0.5350
  Test Loss: 0.7022, Test Acc: 0.4733
Epoch 300/500
  Train Loss: 0.6891, Train Acc: 0.5383
  Test Loss: 0.7027, Test Acc: 0.4667
Epoch 350/500
  Train Loss: 0.6890, Train Acc: 0.5375
  Test Loss: 0.7028, Test Acc: 0.4683
Epoch 400/500
  Train Loss: 0.6889, Train Acc: 0.5387
  Test Loss: 0.7031, Test Acc: 0.4683
Epoch 450/500
  Train Loss: 0.6889, Train Acc: 0.5396
  Test Loss: 0.7034, Test Acc: 0.4683
Epoch 500/500
  Train Loss: 0.6888, Train Acc: 0.5379
  Test Loss: 0.7035, Test Acc: 0.4683

Training complete!
Conclusion: better, but... a monkey would do something similar (53% accuracy). Let's add two important missing features: job title and education level.¶
Me: "AI, add the job title and the education level as features to the code.
AI: "Here it is.."
Experiment 3 - dealing with categories¶
What are the available strategies?¶
- Label Encoding: assigns a unique integer to each category.
  - Use case: suitable for ordinal categorical features where there is a meaningful order (e.g., "low", "medium", "high").
  - Caution: can introduce an artificial order for nominal data, which might negatively impact models expecting independent features.
- One-Hot Encoding: creates new binary columns for each category, indicating presence (1) or absence (0) (see the sketch after this list).
  - Use case: ideal for nominal categorical features where no inherent order exists (e.g., "red", "blue", "green").
  - Caution: can lead to high dimensionality ("curse of dimensionality") for features with many unique categories (high cardinality).
- Target Encoding (Mean Encoding): replaces each category with the mean of the target variable for that category.
  - Use case: effective for high-cardinality features, as it reduces dimensionality while capturing predictive information.
  - Caution: prone to data leakage if not implemented carefully (e.g., calculating means on the entire dataset before splitting into train/test). Use cross-validation or calculate the means only on the training set.
- Binary Encoding: converts categories to binary codes, then creates a new column for each bit of the binary representation.
  - Use case: a compromise between Label and One-Hot Encoding, reducing dimensionality compared to One-Hot while still providing a numerical representation.
- Feature Hashing (Hashing Trick): maps categories to a fixed-size vector using a hash function.
  - Use case: efficient for very high-cardinality features and when memory is a concern.
  - Caution: can lead to collisions (different categories mapping to the same hash value), potentially impacting model accuracy.
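As a quick illustration of the first two strategies on a toy column (a small sketch with made-up values, not the actual dataset; pandas has ready-made helpers for both):
import pandas as pd

toy = pd.DataFrame({"Education_Level": ["High School", "PhD", "Master's", "PhD"]})

# Label Encoding: one arbitrary integer per category
toy["edu_label"] = toy["Education_Level"].astype("category").cat.codes

# One-Hot Encoding: one binary column per category
one_hot = pd.get_dummies(toy["Education_Level"], prefix="edu")

print(toy)
print(one_hot)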
Here is the code (One-Hot Encoding strategy)¶
import jax
import jax.numpy as jnp
from jax import grad, jit
import csv
# Define categorical mappings
JOB_TITLES = ['AI Engineer', 'Chef', 'Construction Worker', 'Customer Support',
'Data Scientist', 'Doctor', 'Financial Analyst', 'Graphic Designer',
'HR Specialist', 'Lawyer', 'Marketing Manager', 'Mechanic',
'Nurse', 'Research Scientist', 'Retail Worker', 'Security Guard',
'Software Engineer', 'Teacher', 'Truck Driver', 'UX Researcher']
EDUCATION_LEVELS = ["Bachelor's", 'High School', "Master's", 'PhD']
# Load and preprocess data
def load_data(filename, max_rows=5000, threshold=0.5):
with open(filename, 'r') as f:
reader = csv.DictReader(f)
rows = list(reader)[:max_rows] # Limit data for memory
# Select numerical features
numerical_features = ['Average_Salary', 'Years_Experience',
'AI_Exposure_Index', 'Tech_Growth_Factor', 'Skill_1',
'Skill_2', 'Skill_3', 'Skill_4',
'Skill_5', 'Skill_6', 'Skill_7',
'Skill_8', 'Skill_9', 'Skill_10']
X = []
y = []
for row in rows:
# Extract numerical features
features = [float(row[name]) for name in numerical_features]
# [Philippe] This is the added code : since both fields are categories, we need to one-hot encode them
# We now have 38 total features (14 numerical + 20 job categories + 4 education categories)
# One-hot encode Job_Title.
job_title = row['Job_Title']
job_encoding = [1.0 if job_title == jt else 0.0 for jt in JOB_TITLES]
# One-hot encode Education_Level
education = row['Education_Level']
edu_encoding = [1.0 if education == el else 0.0 for el in EDUCATION_LEVELS]
# Combine all features
all_features = features + job_encoding + edu_encoding
X.append(all_features)
# Convert probability to binary label
prob = float(row['Automation_Probability_2030'])
y.append(1.0 if prob >= threshold else 0.0)
return jnp.array(X), jnp.array(y)
# Normalize features
def normalize(X):
mean = jnp.mean(X, axis=0)
std = jnp.std(X, axis=0) + 1e-8
return (X - mean) / std
# Initialize network parameters
def init_network(layer_sizes, key):
params = []
for i in range(len(layer_sizes) - 1):
key, subkey = jax.random.split(key)
w = jax.random.normal(subkey, (layer_sizes[i], layer_sizes[i+1])) * 0.1
b = jnp.zeros(layer_sizes[i+1])
params.append((w, b))
return params
# Forward pass
def forward(params, x):
for i, (w, b) in enumerate(params[:-1]):
x = jnp.tanh(jnp.dot(x, w) + b)
w, b = params[-1]
return jnp.dot(x, w) + b
# Sigmoid activation
def sigmoid(x):
return 1 / (1 + jnp.exp(-x))
# Binary cross-entropy loss
def loss_fn(params, x, y):
logits = forward(params, x)
probs = sigmoid(logits.squeeze())
return -jnp.mean(y * jnp.log(probs + 1e-8) + (1 - y) * jnp.log(1 - probs + 1e-8))
# Prediction function
def predict(params, x):
logits = forward(params, x)
return (sigmoid(logits.squeeze()) > 0.5).astype(jnp.float32)
# Training step
@jit
def train_step(params, x, y, lr):
loss, grads = jax.value_and_grad(loss_fn)(params, x, y)
params = [(w - lr * dw, b - lr * db) for (w, b), (dw, db) in zip(params, grads)]
return params, loss
# Main training loop
def train(X, y, layer_sizes=[38, 24, 12, 1], epochs=500, lr=0.01, batch_size=64):
key = jax.random.PRNGKey(42)
# Normalize data
X_norm = normalize(X)
# Split data (80/20)
n_train = int(0.8 * len(X))
X_train, X_test = X_norm[:n_train], X_norm[n_train:]
y_train, y_test = y[:n_train], y[n_train:]
# Initialize network
params = init_network(layer_sizes, key)
# Training loop
n_batches = len(X_train) // batch_size
for epoch in range(epochs):
# Shuffle data
key, subkey = jax.random.split(key)
perm = jax.random.permutation(subkey, len(X_train))
X_train_shuffled = X_train[perm]
y_train_shuffled = y_train[perm]
for i in range(n_batches):
start = i * batch_size
end = start + batch_size
X_batch = X_train_shuffled[start:end]
y_batch = y_train_shuffled[start:end]
params, batch_loss = train_step(params, X_batch, y_batch, lr)
if (epoch + 1) % 50 == 0:
train_loss = loss_fn(params, X_train, y_train)
test_loss = loss_fn(params, X_test, y_test)
train_preds = predict(params, X_train)
test_preds = predict(params, X_test)
train_acc = jnp.mean(train_preds == y_train)
test_acc = jnp.mean(test_preds == y_test)
print(f"Epoch {epoch+1}/{epochs}")
print(f" Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
print(f" Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")
return params
if __name__ == "__main__":
# Load data
print("Loading data...")
X, y = load_data("datasets/AI_Impact_on_Jobs_2030.csv")
print(f"Dataset: {X.shape[0]} samples, {X.shape[1]} features")
print(f"Feature breakdown:")
print(f" - 14 numerical features")
print(f" - 20 job title categories (one-hot encoded)")
print(f" - 4 education level categories (one-hot encoded)")
print(f" - Total: 38 features")
# Train model
print("\nTraining neural network...")
params = train(X, y)
print("\nTraining complete!")
Loading data...
Dataset: 3000 samples, 38 features
Feature breakdown:
  - 14 numerical features
  - 20 job title categories (one-hot encoded)
  - 4 education level categories (one-hot encoded)
  - Total: 38 features

Training neural network...
Epoch 50/500
  Train Loss: 0.3541, Train Acc: 0.7925
  Test Loss: 0.3874, Test Acc: 0.7483
Epoch 100/500
  Train Loss: 0.3342, Train Acc: 0.8117
  Test Loss: 0.3759, Test Acc: 0.7333
Epoch 150/500
  Train Loss: 0.3175, Train Acc: 0.8317
  Test Loss: 0.3754, Test Acc: 0.7533
Epoch 200/500
  Train Loss: 0.2995, Train Acc: 0.8492
  Test Loss: 0.3886, Test Acc: 0.7517
Epoch 250/500
  Train Loss: 0.2798, Train Acc: 0.8713
  Test Loss: 0.4066, Test Acc: 0.7483
Epoch 300/500
  Train Loss: 0.2553, Train Acc: 0.8846
  Test Loss: 0.4264, Test Acc: 0.7533
Epoch 350/500
  Train Loss: 0.2269, Train Acc: 0.9004
  Test Loss: 0.4545, Test Acc: 0.7567
Epoch 400/500
  Train Loss: 0.1980, Train Acc: 0.9188
  Test Loss: 0.4982, Test Acc: 0.7483
Epoch 450/500
  Train Loss: 0.1700, Train Acc: 0.9367
  Test Loss: 0.5620, Test Acc: 0.7550
Epoch 500/500
  Train Loss: 0.1428, Train Acc: 0.9508
  Test Loss: 0.6325, Test Acc: 0.7600

Training complete!
Nice! We have a test accuracy of 76%... but I see two issues:¶
- Why is there such a large gap between the training accuracy and the test accuracy?
- Why do we have so many additional features? Job title is a category: you can (most of the time) not be a computer scientist AND a plumber AND a nurse at the same time. Why do we need a feature for each job title value?
AI answer to the first issue: the large gap between 95% training accuracy and 76% test accuracy is a classic sign of overfitting. Here are the main causes and solutions:
- Overfitting (Most Likely). Your model is memorizing the training data rather than learning generalizable patterns. Signs in your results:
- Training loss keeps decreasing (0.35 → 0.14)
- Test loss actually increases (0.39 → 0.63)
- This divergence is a textbook overfitting pattern
- Model Complexity
- You have 38 input features for only 2,400 training samples
- The network has 3 hidden layers [38 → 24 → 12 → 1]
- This gives the model enough capacity to memorize training examples
- Small Dataset
- 2,400 training samples / 38 features = ~63 samples per feature
- This ratio is relatively low for deep learning
Let me create an improved version with regularization techniques:
Me: "The problem now: I have no clue whether the AI is really improving things or just hallucinating..."
Experiment 4 - adding regularization¶
Here is the code¶
import jax
import jax.numpy as jnp
from jax import grad, jit, random
from functools import partial
import csv
# Define categorical mappings
JOB_TITLES = ['AI Engineer', 'Chef', 'Construction Worker', 'Customer Support',
'Data Scientist', 'Doctor', 'Financial Analyst', 'Graphic Designer',
'HR Specialist', 'Lawyer', 'Marketing Manager', 'Mechanic',
'Nurse', 'Research Scientist', 'Retail Worker', 'Security Guard',
'Software Engineer', 'Teacher', 'Truck Driver', 'UX Researcher']
EDUCATION_LEVELS = ["Bachelor's", 'High School', "Master's", 'PhD']
# Load and preprocess data
def load_data(filename, max_rows=5000, threshold=0.5):
with open(filename, 'r') as f:
reader = csv.DictReader(f)
rows = list(reader)[:max_rows]
numerical_features = ['Average_Salary', 'Years_Experience',
'AI_Exposure_Index', 'Tech_Growth_Factor', 'Skill_1',
'Skill_2', 'Skill_3', 'Skill_4',
'Skill_5', 'Skill_6', 'Skill_7',
'Skill_8', 'Skill_9', 'Skill_10']
X = []
y = []
for row in rows:
features = [float(row[name]) for name in numerical_features]
job_title = row['Job_Title']
job_encoding = [1.0 if job_title == jt else 0.0 for jt in JOB_TITLES]
education = row['Education_Level']
edu_encoding = [1.0 if education == el else 0.0 for el in EDUCATION_LEVELS]
all_features = features + job_encoding + edu_encoding
X.append(all_features)
prob = float(row['Automation_Probability_2030'])
y.append(1.0 if prob >= threshold else 0.0)
return jnp.array(X), jnp.array(y)
# Normalize features
def normalize(X):
mean = jnp.mean(X, axis=0)
std = jnp.std(X, axis=0) + 1e-8
return (X - mean) / std, mean, std
# Initialize network parameters with Xavier initialization
def init_network(layer_sizes, key):
params = []
for i in range(len(layer_sizes) - 1):
key, subkey = random.split(key)
scale = jnp.sqrt(2.0 / (layer_sizes[i] + layer_sizes[i+1]))
w = random.normal(subkey, (layer_sizes[i], layer_sizes[i+1])) * scale
b = jnp.zeros(layer_sizes[i+1])
params.append((w, b))
return params
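# [Note] The sqrt(2 / (fan_in + fan_out)) scale above is the Xavier/Glorot heuristic:
# it keeps the variance of activations roughly constant from layer to layer,
# unlike the fixed 0.1 scale used in the earlier experiments.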
# Forward pass
def forward(params, x):
for i, (w, b) in enumerate(params[:-1]):
x = jnp.tanh(jnp.dot(x, w) + b)
w, b = params[-1]
return jnp.dot(x, w) + b
# Sigmoid activation
def sigmoid(x):
return 1 / (1 + jnp.exp(-jnp.clip(x, -20, 20)))
# Binary cross-entropy loss with L2 regularization
def loss_fn(params, x, y, l2_reg):
logits = forward(params, x)
probs = sigmoid(logits.squeeze())
# Cross-entropy loss
ce_loss = -jnp.mean(y * jnp.log(probs + 1e-8) + (1 - y) * jnp.log(1 - probs + 1e-8))
# L2 regularization (weight decay)
l2_loss = 0.0
for w, b in params:
l2_loss += jnp.sum(w ** 2)
return ce_loss + l2_reg * l2_loss
# Prediction function
def predict(params, x):
logits = forward(params, x)
return (sigmoid(logits.squeeze()) > 0.5).astype(jnp.float32)
# Training step
@jit
def train_step(params, x, y, lr, l2_reg):
loss, grads = jax.value_and_grad(loss_fn)(params, x, y, l2_reg)
params = [(w - lr * dw, b - lr * db) for (w, b), (dw, db) in zip(params, grads)]
return params, loss
# Evaluation loss (no regularization for reporting)
def eval_loss(params, x, y):
logits = forward(params, x)
probs = sigmoid(logits.squeeze())
return -jnp.mean(y * jnp.log(probs + 1e-8) + (1 - y) * jnp.log(1 - probs + 1e-8))
# Main training loop with early stopping
def train(X, y, layer_sizes=[38, 16, 1], epochs=500, lr=0.01, batch_size=64,
l2_reg=0.01, patience=150):
key = random.PRNGKey(42)
# Normalize data
X_norm, mean, std = normalize(X)
# Split data (80/20)
n_train = int(0.8 * len(X))
X_train, X_test = X_norm[:n_train], X_norm[n_train:]
y_train, y_test = y[:n_train], y[n_train:]
# Initialize network
params = init_network(layer_sizes, key)
# Early stopping variables
best_test_loss = float('inf')
best_params = params
patience_counter = 0
# Training loop
n_batches = len(X_train) // batch_size
print(f"\nTraining Configuration:")
print(f" Network: {layer_sizes}")
print(f" L2 Regularization: {l2_reg}")
print(f" Learning Rate: {lr}")
print(f" Batch Size: {batch_size}")
print(f" Early Stopping Patience: {patience}\n")
for epoch in range(epochs):
# Shuffle data
key, subkey = random.split(key)
perm = random.permutation(subkey, len(X_train))
X_train_shuffled = X_train[perm]
y_train_shuffled = y_train[perm]
for i in range(n_batches):
start = i * batch_size
end = start + batch_size
X_batch = X_train_shuffled[start:end]
y_batch = y_train_shuffled[start:end]
params, batch_loss = train_step(params, X_batch, y_batch, lr, l2_reg)
if (epoch + 1) % 50 == 0:
# Evaluate
train_loss = eval_loss(params, X_train, y_train)
test_loss = eval_loss(params, X_test, y_test)
train_preds = predict(params, X_train)
test_preds = predict(params, X_test)
train_acc = jnp.mean(train_preds == y_train)
test_acc = jnp.mean(test_preds == y_test)
print(f"Epoch {epoch+1}/{epochs}")
print(f" Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
print(f" Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")
print(f" Accuracy Gap: {train_acc - test_acc:.4f}")
# Early stopping check
if test_loss < best_test_loss:
best_test_loss = test_loss
best_params = params
patience_counter = 0
else:
patience_counter += 50
if patience_counter >= patience:
print(f"\nEarly stopping at epoch {epoch+1}")
print(f"Best test loss: {best_test_loss:.4f}")
break
return best_params
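# [Note] Evaluation (and the early-stopping check) only runs every 50 epochs, and
# patience_counter is incremented by 50 per non-improving evaluation, so
# patience=150 means training stops after three consecutive evaluations without
# improvement in test loss.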
if __name__ == "__main__":
print("="*70)
print("ADDRESSING OVERFITTING: COMPARING DIFFERENT REGULARIZATION STRATEGIES")
print("="*70)
# Load data
print("\nLoading data...")
X, y = load_data("datasets/AI_Impact_on_Jobs_2030.csv")
print(f"Dataset: {X.shape[0]} samples, {X.shape[1]} features")
print(f"\nProblem: Original model had 95% train accuracy but only 76% test accuracy")
print("This 19% gap indicates OVERFITTING - the model memorized training data\n")
print("\n" + "="*70)
print("EXPERIMENT 1: Original Architecture (Baseline)")
print("="*70)
print("Strategy: No regularization, deeper network [38, 24, 12, 1]")
params1 = train(X, y, layer_sizes=[38, 24, 12, 1], epochs=500, lr=0.01,
l2_reg=0.0, patience=150)
print("\n" + "="*70)
print("EXPERIMENT 2: Simpler Architecture")
print("="*70)
print("Strategy: Reduce model complexity [38, 16, 1]")
params2 = train(X, y, layer_sizes=[38, 16, 1], epochs=500, lr=0.01,
l2_reg=0.0, patience=150)
print("\n" + "="*70)
print("EXPERIMENT 3: L2 Regularization (Weak)")
print("="*70)
print("Strategy: Add weak L2 penalty (0.001) to prevent large weights")
params3 = train(X, y, layer_sizes=[38, 16, 1], epochs=500, lr=0.01,
l2_reg=0.001, patience=150)
print("\n" + "="*70)
print("EXPERIMENT 4: L2 Regularization (Strong)")
print("="*70)
print("Strategy: Add stronger L2 penalty (0.01) to enforce weight decay")
params4 = train(X, y, layer_sizes=[38, 16, 1], epochs=500, lr=0.01,
l2_reg=0.01, patience=150)
print("\n" + "="*70)
print("SUMMARY & RECOMMENDATIONS")
print("="*70)
======================================================================
ADDRESSING OVERFITTING: COMPARING DIFFERENT REGULARIZATION STRATEGIES
======================================================================

Loading data...
Dataset: 3000 samples, 38 features

Problem: Original model had 95% train accuracy but only 76% test accuracy
This 19% gap indicates OVERFITTING - the model memorized training data

======================================================================
EXPERIMENT 1: Original Architecture (Baseline)
======================================================================
Strategy: No regularization, deeper network [38, 24, 12, 1]

Training Configuration:
  Network: [38, 24, 12, 1]
  L2 Regularization: 0.0
  Learning Rate: 0.01
  Batch Size: 64
  Early Stopping Patience: 150

Epoch 50/500
  Train Loss: 0.3419, Train Acc: 0.8100
  Test Loss: 0.3874, Test Acc: 0.7583
  Accuracy Gap: 0.0517
Epoch 100/500
  Train Loss: 0.3138, Train Acc: 0.8408
  Test Loss: 0.3822, Test Acc: 0.7600
  Accuracy Gap: 0.0808
Epoch 150/500
  Train Loss: 0.2921, Train Acc: 0.8600
  Test Loss: 0.3913, Test Acc: 0.7467
  Accuracy Gap: 0.1133
Epoch 200/500
  Train Loss: 0.2704, Train Acc: 0.8804
  Test Loss: 0.4129, Test Acc: 0.7383
  Accuracy Gap: 0.1421
Epoch 250/500
  Train Loss: 0.2487, Train Acc: 0.8892
  Test Loss: 0.4417, Test Acc: 0.7450
  Accuracy Gap: 0.1442

Early stopping at epoch 250
Best test loss: 0.3822

======================================================================
EXPERIMENT 2: Simpler Architecture
======================================================================
Strategy: Reduce model complexity [38, 16, 1]

Training Configuration:
  Network: [38, 16, 1]
  L2 Regularization: 0.0
  Learning Rate: 0.01
  Batch Size: 64
  Early Stopping Patience: 150

Epoch 50/500
  Train Loss: 0.3571, Train Acc: 0.7846
  Test Loss: 0.3850, Test Acc: 0.7500
  Accuracy Gap: 0.0346
Epoch 100/500
  Train Loss: 0.3372, Train Acc: 0.7983
  Test Loss: 0.3755, Test Acc: 0.7550
  Accuracy Gap: 0.0433
Epoch 150/500
  Train Loss: 0.3259, Train Acc: 0.8175
  Test Loss: 0.3745, Test Acc: 0.7533
  Accuracy Gap: 0.0642
Epoch 200/500
  Train Loss: 0.3171, Train Acc: 0.8233
  Test Loss: 0.3759, Test Acc: 0.7483
  Accuracy Gap: 0.0750
Epoch 250/500
  Train Loss: 0.3089, Train Acc: 0.8354
  Test Loss: 0.3785, Test Acc: 0.7450
  Accuracy Gap: 0.0904
Epoch 300/500
  Train Loss: 0.3006, Train Acc: 0.8471
  Test Loss: 0.3807, Test Acc: 0.7450
  Accuracy Gap: 0.1021

Early stopping at epoch 300
Best test loss: 0.3745

======================================================================
EXPERIMENT 3: L2 Regularization (Weak)
======================================================================
Strategy: Add weak L2 penalty (0.001) to prevent large weights

Training Configuration:
  Network: [38, 16, 1]
  L2 Regularization: 0.001
  Learning Rate: 0.01
  Batch Size: 64
  Early Stopping Patience: 150

Epoch 50/500
  Train Loss: 0.3578, Train Acc: 0.7846
  Test Loss: 0.3858, Test Acc: 0.7483
  Accuracy Gap: 0.0362
Epoch 100/500
  Train Loss: 0.3387, Train Acc: 0.7975
  Test Loss: 0.3761, Test Acc: 0.7533
  Accuracy Gap: 0.0442
Epoch 150/500
  Train Loss: 0.3289, Train Acc: 0.8179
  Test Loss: 0.3744, Test Acc: 0.7567
  Accuracy Gap: 0.0613
Epoch 200/500
  Train Loss: 0.3217, Train Acc: 0.8296
  Test Loss: 0.3750, Test Acc: 0.7517
  Accuracy Gap: 0.0779
Epoch 250/500
  Train Loss: 0.3155, Train Acc: 0.8354
  Test Loss: 0.3765, Test Acc: 0.7500
  Accuracy Gap: 0.0854
Epoch 300/500
  Train Loss: 0.3095, Train Acc: 0.8421
  Test Loss: 0.3774, Test Acc: 0.7483
  Accuracy Gap: 0.0938

Early stopping at epoch 300
Best test loss: 0.3744

======================================================================
EXPERIMENT 4: L2 Regularization (Strong)
======================================================================
Strategy: Add stronger L2 penalty (0.01) to enforce weight decay

Training Configuration:
  Network: [38, 16, 1]
  L2 Regularization: 0.01
  Learning Rate: 0.01
  Batch Size: 64
  Early Stopping Patience: 150

Epoch 50/500
  Train Loss: 0.3667, Train Acc: 0.7846
  Test Loss: 0.3950, Test Acc: 0.7500
  Accuracy Gap: 0.0346
Epoch 100/500
  Train Loss: 0.3565, Train Acc: 0.7912
  Test Loss: 0.3878, Test Acc: 0.7483
  Accuracy Gap: 0.0429
Epoch 150/500
  Train Loss: 0.3558, Train Acc: 0.7875
  Test Loss: 0.3869, Test Acc: 0.7433
  Accuracy Gap: 0.0442
Epoch 200/500
  Train Loss: 0.3563, Train Acc: 0.7846
  Test Loss: 0.3870, Test Acc: 0.7483
  Accuracy Gap: 0.0362
Epoch 250/500
  Train Loss: 0.3567, Train Acc: 0.7867
  Test Loss: 0.3871, Test Acc: 0.7383
  Accuracy Gap: 0.0483
Epoch 300/500
  Train Loss: 0.3570, Train Acc: 0.7837
  Test Loss: 0.3867, Test Acc: 0.7400
  Accuracy Gap: 0.0437
Epoch 350/500
  Train Loss: 0.3571, Train Acc: 0.7842
  Test Loss: 0.3871, Test Acc: 0.7400
  Accuracy Gap: 0.0442
Epoch 400/500
  Train Loss: 0.3571, Train Acc: 0.7837
  Test Loss: 0.3870, Test Acc: 0.7417
  Accuracy Gap: 0.0421
Epoch 450/500
  Train Loss: 0.3571, Train Acc: 0.7854
  Test Loss: 0.3868, Test Acc: 0.7300
  Accuracy Gap: 0.0554

Early stopping at epoch 450
Best test loss: 0.3867

======================================================================
SUMMARY & RECOMMENDATIONS
======================================================================
Overfitting Causes Identified:
- Model complexity: 3 hidden layers with many parameters
- Small dataset: Only ~63 samples per feature
- No regularization: Model free to memorize patterns
- Training too long: 500 epochs without early stopping
Best Practices to Reduce Overfitting:
- ✓ Simplify architecture: Use fewer/smaller hidden layers
- ✓ Add L2 regularization: Penalize large weights (try 0.001-0.01)
- ✓ Use early stopping: Stop when test loss stops improving
- Consider dropout: Randomly disable neurons during training
- Get more data: Increase training samples if possible
- Feature selection: Remove irrelevant features
Expected Results:
- Simpler models: Smaller train-test gap (5-10% instead of 19%)
- L2 regularization: Test accuracy should improve by 2-5%
- Early stopping: Prevents overfitting in later epochs
The key insight: your 95% training accuracy was actually a warning sign, not a success! A simpler model that gets 82% on both train and test is much better than one that gets 95% train and 76% test.
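Dropout is suggested above but never implemented. For reference, here is a minimal sketch of how it could be added to the JAX forward pass (inverted dropout; the function names and the 0.2 rate are mine, not part of the generated code):
import jax
import jax.numpy as jnp

def dropout(key, x, rate, training):
    # During training, zero a fraction `rate` of activations and rescale the
    # survivors by 1/(1-rate) so the expected activation stays the same.
    if not training or rate == 0.0:
        return x
    keep = 1.0 - rate
    mask = jax.random.bernoulli(key, keep, x.shape)
    return jnp.where(mask, x / keep, 0.0)

def forward_with_dropout(params, x, key, rate=0.2, training=True):
    for w, b in params[:-1]:
        key, subkey = jax.random.split(key)
        x = jnp.tanh(jnp.dot(x, w) + b)
        x = dropout(subkey, x, rate, training)
    w, b = params[-1]
    return jnp.dot(x, w) + b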
Conclusion: accuracy is around 73%. I still see an issue with using a yes/no output in place of a probability¶
Experiment 5 - it is a regression problem, not a classification problem!¶
Me: "AI, in this code (the one provided initialy, without the two features added later), the loss function uses binary cross-entropy, but Automation_Probability_2030 in the CSV is likely a continuous value (0.0 to 1.0), What loss function should i use instead ?
AI: "If Automation_Probability_2030 is a continuous value between 0.0 and 1.0 (which makes sense for a probability), this is a regression problem, not a binary classification problem. Here are the changes you should make:
- Use Mean Squared Error (MSE) loss instead
- Modify the output to use sigmoid activation:
- Update the prediction function for regression
- Change accuracy metrics to regression metrics. Replace accuracy with Mean Absolute Error (MAE) or R² score"
Alternative: If you prefer treating this as classification (high/low automation risk), you could threshold the continuous values at 0.5 and keep binary cross-entropy, but MSE is more appropriate for continuous targets.
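For the record, the two regression metrics mentioned here are one-liners in JAX (a sketch; mae and r2_score are names I chose, they are not part of the generated code):
import jax.numpy as jnp

def mae(y_true, y_pred):
    # Mean Absolute Error: average absolute distance between prediction and target
    return jnp.mean(jnp.abs(y_true - y_pred))

def r2_score(y_true, y_pred):
    # R²: 1 minus the ratio of residual variance to total variance of the target
    ss_res = jnp.sum((y_true - y_pred) ** 2)
    ss_tot = jnp.sum((y_true - jnp.mean(y_true)) ** 2)
    return 1.0 - ss_res / ss_tot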
Here is the code (note: only the numerical features are used in this example)¶
import jax
import jax.numpy as jnp
from jax import grad, jit
import csv
# Load and preprocess data
# [Philippe] Still the same question here: why do we get rid of the non-numerical features ??
def load_data(filename, max_rows=5000):
with open(filename, 'r') as f:
reader = csv.DictReader(f)
rows = list(reader)[:max_rows] # Limit data for memory
# Select numerical features
feature_names = ['Average_Salary', 'Years_Experience',
'AI_Exposure_Index', 'Tech_Growth_Factor', 'Skill_1',
'Skill_2', 'Skill_3', 'Skill_4',
'Skill_5', 'Skill_6', 'Skill_7',
'Skill_8', 'Skill_9', 'Skill_10']
X = []
y = []
for row in rows:
features = [float(row[name]) for name in feature_names]
X.append(features)
y.append(float(row['Automation_Probability_2030']))
return jnp.array(X), jnp.array(y)
# Normalize features
def normalize(X):
mean = jnp.mean(X, axis=0)
std = jnp.std(X, axis=0) + 1e-8
return (X - mean) / std
# Initialize network parameters
# [Philippe] In this code (when called), we will have 4 layers: an input layer (14 features => 14 neurons), a hidden layer with 15 neurons (why one more than the inputs??), another hidden layer with 8 neurons, and an output layer with 1 neuron
def init_network(layer_sizes, key):
params = []
for i in range(len(layer_sizes) - 1):
key, subkey = jax.random.split(key)
w = jax.random.normal(subkey, (layer_sizes[i], layer_sizes[i+1])) * 0.1
b = jnp.zeros(layer_sizes[i+1])
params.append((w, b))
return params
# Forward pass with sigmoid on output
def forward(params, x):
for i, (w, b) in enumerate(params[:-1]):
x = jnp.tanh(jnp.dot(x, w) + b)
w, b = params[-1]
logits = jnp.dot(x, w) + b
return sigmoid(logits) # Add sigmoid to bound output to [0, 1]
# Sigmoid activation
def sigmoid(x):
return 1 / (1 + jnp.exp(-x))
# MSE loss for regression
def loss_fn(params, x, y):
preds = forward(params, x).squeeze()
return jnp.mean((preds - y) ** 2)
# Prediction function (no thresholding needed for regression)
def predict(params, x):
return forward(params, x).squeeze()
# Training step
@jit
def train_step(params, x, y, lr):
loss, grads = jax.value_and_grad(loss_fn)(params, x, y)
params = [(w - lr * dw, b - lr * db) for (w, b), (dw, db) in zip(params, grads)]
return params, loss
# Main training loop
def train(X, y, layer_sizes=[14, 15, 8, 1], epochs=500, lr=0.01, batch_size=64):
key = jax.random.PRNGKey(42)
# Normalize data
X_norm = normalize(X)
# Split data (80/20)
n_train = int(0.8 * len(X))
X_train, X_test = X_norm[:n_train], X_norm[n_train:]
y_train, y_test = y[:n_train], y[n_train:]
# Initialize network
params = init_network(layer_sizes, key)
# Training loop
n_batches = len(X_train) // batch_size
for epoch in range(epochs):
# Shuffle data
key, subkey = jax.random.split(key)
perm = jax.random.permutation(subkey, len(X_train))
X_train_shuffled = X_train[perm]
y_train_shuffled = y_train[perm]
for i in range(n_batches):
start = i * batch_size
end = start + batch_size
X_batch = X_train_shuffled[start:end]
y_batch = y_train_shuffled[start:end]
params, batch_loss = train_step(params, X_batch, y_batch, lr)
if (epoch + 1) % 50 == 0:
train_loss = loss_fn(params, X_train, y_train)
test_loss = loss_fn(params, X_test, y_test)
train_preds = predict(params, X_train)
test_preds = predict(params, X_test)
train_mae = jnp.mean(jnp.abs(train_preds - y_train))
test_mae = jnp.mean(jnp.abs(test_preds - y_test))
print(f"Epoch {epoch+1}/{epochs}")
print(f" Train Loss (MSE): {train_loss:.4f}, Train MAE: {train_mae:.4f}")
print(f" Test Loss (MSE): {test_loss:.4f}, Test MAE: {test_mae:.4f}")
return params
if __name__ == "__main__":
# Load data
print("Loading data...")
X, y = load_data("datasets/AI_Impact_on_Jobs_2030.csv")
print(f"Dataset: {X.shape[0]} samples, {X.shape[1]} features")
# Train model
print("\nTraining neural network...")
params = train(X, y)
print("\nTraining complete!")
print(y)
Loading data...
Dataset: 3000 samples, 14 features

Training neural network...
Epoch 50/500
  Train Loss (MSE): 0.0625, Train MAE: 0.2120
  Test Loss (MSE): 0.0571, Test MAE: 0.2013
Epoch 100/500
  Train Loss (MSE): 0.0624, Train MAE: 0.2119
  Test Loss (MSE): 0.0572, Test MAE: 0.2015
Epoch 150/500
  Train Loss (MSE): 0.0624, Train MAE: 0.2117
  Test Loss (MSE): 0.0573, Test MAE: 0.2017
Epoch 200/500
  Train Loss (MSE): 0.0623, Train MAE: 0.2116
  Test Loss (MSE): 0.0574, Test MAE: 0.2019
Epoch 250/500
  Train Loss (MSE): 0.0623, Train MAE: 0.2115
  Test Loss (MSE): 0.0575, Test MAE: 0.2020
Epoch 300/500
  Train Loss (MSE): 0.0622, Train MAE: 0.2114
  Test Loss (MSE): 0.0576, Test MAE: 0.2022
Epoch 350/500
  Train Loss (MSE): 0.0622, Train MAE: 0.2113
  Test Loss (MSE): 0.0576, Test MAE: 0.2024
Epoch 400/500
  Train Loss (MSE): 0.0622, Train MAE: 0.2113
  Test Loss (MSE): 0.0577, Test MAE: 0.2025
Epoch 450/500
  Train Loss (MSE): 0.0622, Train MAE: 0.2112
  Test Loss (MSE): 0.0578, Test MAE: 0.2027
Epoch 500/500
  Train Loss (MSE): 0.0621, Train MAE: 0.2112
  Test Loss (MSE): 0.0579, Test MAE: 0.2028

Training complete!
[0.85 0.05 0.81 ... 0.39 0.46 0.49]
Conclusion: the Mean Absolute Error is around 0.20.¶
The MAE indicates the average absolute error between predicted and actual values. The smaller the MAE, the better the model's predictions align with the actual data. An MAE of 0 would mean a perfect prediction, but in most cases achieving such perfection is unlikely. It is important to compare the MAE to the scale of the target variable: here the target is a probability between 0 and 1, so an average error of 0.20 (about 20% of the full range) is a large error.
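One way to judge an MAE of 0.20 is to compare it with a trivial baseline that always predicts the mean of the training targets (a small sketch reusing y from load_data above and the same 80/20 split as train()):
import jax.numpy as jnp

# Same 80/20 split as in train()
n_train = int(0.8 * len(y))
y_train, y_test = y[:n_train], y[n_train:]

# Baseline: constant prediction equal to the training-set mean
baseline_mae = jnp.mean(jnp.abs(y_test - jnp.mean(y_train)))
print(f"Baseline MAE (always predict the mean): {baseline_mae:.4f}")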
Experiment 6 - using an MLP regressor¶
Here is the code (similar to the code sample provided in class, but using a regressor)¶
from sklearn.neural_network import MLPRegressor
import numpy as np
import csv
# Define categorical mappings
JOB_TITLES = ['AI Engineer', 'Chef', 'Construction Worker', 'Customer Support',
'Data Scientist', 'Doctor', 'Financial Analyst', 'Graphic Designer',
'HR Specialist', 'Lawyer', 'Marketing Manager', 'Mechanic',
'Nurse', 'Research Scientist', 'Retail Worker', 'Security Guard',
'Software Engineer', 'Teacher', 'Truck Driver', 'UX Researcher']
EDUCATION_LEVELS = ["Bachelor's", 'High School', "Master's", 'PhD']
# Load and preprocess data
def load_data(filename, max_rows=5000, threshold=0.5):
with open(filename, 'r') as f:
reader = csv.DictReader(f)
rows = list(reader)[:max_rows] # Limit data for memory
# Select numerical features
numerical_features = ['Average_Salary', 'Years_Experience',
'AI_Exposure_Index', 'Tech_Growth_Factor', 'Skill_1',
'Skill_2', 'Skill_3', 'Skill_4',
'Skill_5', 'Skill_6', 'Skill_7',
'Skill_8', 'Skill_9', 'Skill_10']
X = []
y = []
for row in rows:
# Extract numerical features
features = [float(row[name]) for name in numerical_features]
# One-hot encode Job_Title.
job_title = row['Job_Title']
job_encoding = [1.0 if job_title == jt else 0.0 for jt in JOB_TITLES]
# One-hot encode Education_Level
education = row['Education_Level']
edu_encoding = [1.0 if education == el else 0.0 for el in EDUCATION_LEVELS]
# Combine all features
all_features = features + job_encoding + edu_encoding
X.append(all_features)
y.append(float(row['Automation_Probability_2030']))
return np.array(X), np.array(y)
# Normalize features
def normalize(X):
mean = np.mean(X, axis=0)
std = np.std(X, axis=0) + 1e-8
return (X - mean) / std
if __name__ == "__main__":
# Load data
print("Loading data...")
X, y = load_data("datasets/AI_Impact_on_Jobs_2030.csv")
print(f"Dataset: {X.shape[0]} samples, {X.shape[1]} features")
# Normalize data
X_norm = normalize(X)
# Split data (80/20)
n_train = int(0.8 * len(X))
xtrain, xtest = X_norm[:n_train], X_norm[n_train:]
ytrain, ytest = y[:n_train], y[n_train:]
print(f"read {xtrain.shape[1]} byte data records, {xtrain.shape[0]} training examples, {xtest.shape[0]} testing examples\n")
regr = MLPRegressor(random_state=1, max_iter=2000, tol=0.1)
regr.fit(xtrain, ytrain)
predictions = regr.predict(xtest[:20])
print(predictions)
print(ytest[:20])
print(f"\ntest score: {regr.score(xtest,ytest)}\n")
Loading data...
Dataset: 3000 samples, 38 features
read 38 byte data records, 2400 training examples, 600 testing examples

[0.58870038 0.34096235 1.04184036 0.35484229 0.14032916 0.53031915 0.56390841 0.45442037 0.1585959 0.18552326 0.5477567 0.30784507 0.62668054 0.49633547 0.70257281 0.36889893 0.07325958 1.01327444 0.8125246 0.73722747]
[0.7 0.5 0.72 0.37 0.42 0.4 0.52 0.43 0.22 0.22 0.48 0.4 0.52 0.35 0.34 0.74 0.69 0.74 0.36 0.88]

test score: 0.3363055580276779
Conclusion: the test score (R², as returned by regr.score) is only about 0.34...¶
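To compare this result with Experiment 5 on the same footing, the MAE and R² of the sklearn model can be computed explicitly (a sketch reusing regr, xtest and ytest from the script above; for a regressor, regr.score already returns R²):
from sklearn.metrics import mean_absolute_error, r2_score

preds = regr.predict(xtest)
print("Test MAE:", mean_absolute_error(ytest, preds))
print("Test R²: ", r2_score(ytest, preds))  # same value as regr.score(xtest, ytest)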