Week 4: machine learning - "Loan approval" dataset¶
Context¶
- Source: Kaggle
- Description: complete dataset of 50,000 loan applications across Credit Cards, Personal Loans, and Lines of Credit. Includes customer demographics, financial profiles, credit behavior, and approval decisions based on real US & Canadian banking criteria.
- Credit: Brian Risk on Kaggle
Load dataset¶
In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import jax
import jax.numpy as jnp
from jax import random,grad,jit
from sklearn.model_selection import train_test_split
dr_raw = pd.read_csv("datasets/Loan_approval_data_2025.csv", delimiter=',', encoding='ascii')
df = dr_raw.drop(['customer_id'], axis=1) # Drop customer_id as it is not useful for prediction
print("Dataset shape:", df.shape)
Dataset shape: (50000, 19)
Explore content¶
In [2]:
df.head()
Out[2]:
| | age | occupation_status | years_employed | annual_income | credit_score | credit_history_years | savings_assets | current_debt | defaults_on_file | delinquencies_last_2yrs | derogatory_marks | product_type | loan_intent | loan_amount | interest_rate | debt_to_income_ratio | loan_to_income_ratio | payment_to_income_ratio | loan_status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40 | Employed | 17.2 | 25579 | 692 | 5.3 | 895 | 10820 | 0 | 0 | 0 | Credit Card | Business | 600 | 17.02 | 0.423 | 0.023 | 0.008 | 1 |
| 1 | 33 | Employed | 7.3 | 43087 | 627 | 3.5 | 169 | 16550 | 0 | 1 | 0 | Personal Loan | Home Improvement | 53300 | 14.10 | 0.384 | 1.237 | 0.412 | 0 |
| 2 | 42 | Student | 1.1 | 20840 | 689 | 8.4 | 17 | 7852 | 0 | 0 | 0 | Credit Card | Debt Consolidation | 2100 | 18.33 | 0.377 | 0.101 | 0.034 | 1 |
| 3 | 53 | Student | 0.5 | 29147 | 692 | 9.8 | 1480 | 11603 | 0 | 1 | 0 | Credit Card | Business | 2900 | 18.74 | 0.398 | 0.099 | 0.033 | 1 |
| 4 | 32 | Employed | 12.5 | 63657 | 630 | 7.2 | 209 | 12424 | 0 | 0 | 0 | Personal Loan | Education | 99600 | 13.92 | 0.195 | 1.565 | 0.522 | 1 |
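Before modelling, it is worth a quick look at how balanced the target is and at the scale of the numeric columns. The small cell below is my own addition; it only reuses the `df` frame loaded above.
In [ ]:
# Quick exploration (my own addition): class balance of the target and scale of the numeric columns
print(df['loan_status'].value_counts(normalize=True))
print(df.describe().T[['mean', 'std', 'min', 'max']])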
Prepare training and test datasets¶
In [3]:
# Identify categorical variables to convert using one-hot encoding
categorical_vars = ['occupation_status', 'product_type', 'loan_intent']
# [Philippe] pandas.get_dummies() converts categorical variables into dummy/indicator variables.
# This is known as one-hot encoding, and it is essential for preparing categorical data for machine learning algorithms that require numerical input (a toy example follows this cell).
df_model = pd.get_dummies(df, columns=categorical_vars, drop_first=True)
# Define the target and features
target = 'loan_status'
features = [col for col in df_model.columns if col != target]
print("Features=",features)
# Note: dtype='int64' truncates the float-valued columns (years_employed, interest_rate, the ratios...); kept as-is for this first attempt
X = df_model[features].to_numpy(dtype='int64')
y = df_model[target].to_numpy(dtype='int64')
# Split the dataset into training and testing sets
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)
print("xtrain",xtrain.shape)
print("xtest",xtest.shape)
print("ytrain",ytrain.shape)
print("ytest",ytest.shape)
print("",)
Features= ['age', 'years_employed', 'annual_income', 'credit_score', 'credit_history_years', 'savings_assets', 'current_debt', 'defaults_on_file', 'delinquencies_last_2yrs', 'derogatory_marks', 'loan_amount', 'interest_rate', 'debt_to_income_ratio', 'loan_to_income_ratio', 'payment_to_income_ratio', 'occupation_status_Self-Employed', 'occupation_status_Student', 'product_type_Line of Credit', 'product_type_Personal Loan', 'loan_intent_Debt Consolidation', 'loan_intent_Education', 'loan_intent_Home Improvement', 'loan_intent_Medical', 'loan_intent_Personal']
xtrain (40000, 24)
xtest (10000, 24)
ytrain (40000,)
ytest (10000,)
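To make the encoding concrete, here is a toy example of my own (not part of the assignment data) showing what `pd.get_dummies` with `drop_first=True` produces:
In [ ]:
# Toy illustration of one-hot encoding (my own example): the first category is dropped,
# so 3 product types become 2 indicator columns
toy = pd.DataFrame({'product_type': ['Credit Card', 'Personal Loan', 'Line of Credit']})
print(pd.get_dummies(toy, columns=['product_type'], drop_first=True))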
First attempt. We want to predict whether a loan will be granted or not, based on the data provided by the customer and their profile. Let's start by replicating the code presented in class¶
In [4]:
#
# hyperparameters
#
data_size = 24 # updated since we have 24 features in the loan approval dataset (after one-hot encoding), instead of 784 pixels
hidden_size = data_size//10 # does that mean we end up with a hidden layer of only 2 neurons? Do we need more layers??
output_size = 10 # feels wrong... we just want one output
batch_size = 5000
train_steps = 25
learning_rate = 0.5
#
# init random key
#
key = random.PRNGKey(0)
#
# forward pass
#
@jit
def forward(params,layer_0):
Weight1,bias1,Weight2,bias2 = params
layer_1 = jnp.tanh(layer_0@Weight1+bias1)
layer_2 = layer_1@Weight2+bias2
return layer_2
#
# loss function: softmax over the 10 outputs, then 1 - the mean probability assigned to the true label
#
@jit
def loss(params,xtrain,ytrain):
ypred = forward(params,xtrain)
yscale = jnp.exp(ypred)/jnp.sum(jnp.exp(ypred),axis=1,keepdims=True)
error = 1-jnp.mean(yscale[jnp.arange(len(ytrain)),ytrain])
return error
#
# gradient update step
#
@jit
def update(params,xtrain,ytrain,rate):
gradient = grad(loss)(params,xtrain,ytrain)
return jax.tree.map(lambda params,gradient:params-rate*gradient,params,gradient)
#
# parameter initialization
#
def init_params(key,xsize,hidden,output):
key1,key = random.split(key)
Weight1 = 0.01*random.normal(key1,(xsize,hidden))
bias1 = jnp.zeros(hidden)
key2,key = random.split(key)
Weight2 = 0.01*random.normal(key2,(hidden,output))
bias2 = jnp.zeros(output)
return (Weight1,bias1,Weight2,bias2)
#
# initialize parameters
#
params = init_params(key,data_size,hidden_size,output_size)
#
# train
#
print(f"starting loss: {loss(params,xtrain,ytrain):.3f}\n")
for batch in range(0,len(ytrain),batch_size):
xbatch = xtrain[batch:batch+batch_size]
ybatch = ytrain[batch:batch+batch_size]
print(f"batch {batch}: ",end='')
for step in range(train_steps):
params = update(params,xbatch,ybatch,rate=learning_rate)
print(f"loss {loss(params,xbatch,ybatch):.3f}")
#
# test
#
ypred = forward(params,xtest)
yscale = jnp.exp(ypred)/jnp.sum(jnp.exp(ypred),axis=1,keepdims=True)
error = 1-jnp.mean(yscale[jnp.arange(len(ytest)),ytest])
print(f"\ntest loss: {error:.3f}\n")
starting loss: 0.900

batch 0: loss 0.562
batch 5000: loss 0.466
batch 10000: loss 0.457
batch 15000: loss 0.449
batch 20000: loss 0.470
batch 25000: loss 0.456
batch 30000: loss 0.463
batch 35000: loss 0.446

test loss: 0.454
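The loss alone is hard to interpret, so here is a small check of my own: it reuses `forward`, `params`, `xtest` and `ytest` from the cell above (assuming it has just been run) and measures plain accuracy by picking the most probable of the 10 outputs.
In [ ]:
# My own check (assumes the previous cell has been run): test accuracy of the 10-way classifier,
# taking the argmax over the 10 outputs and comparing it with the 0/1 labels
ypred = forward(params, xtest)
accuracy = jnp.mean(jnp.argmax(ypred, axis=1) == ytest)
print(f"test accuracy: {accuracy:.3f}")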
Conclusion: it runs, but it does not work that well...¶
- Just one hidden layer with only 2 neurons... maybe not enough?
- The input data is different: 18 raw features (15 numeric, 3 categorical), fed as 24 columns after one-hot encoding, instead of a clean list of 784 pixels that all share the same datatype
- The output is different as well: we expect a boolean (approved or denied), not a 10-class classification (digits 0-9)
- Maybe the batch size and the number of steps are not right in this context...
- Maybe more...
Second attempt, with code generated by Claude AI using the prompt presented in class (with my comments inline)¶
In [5]:
import jax
import jax.numpy as jnp
from jax import grad, jit
import csv
# Load and preprocess data
# [Philippe] The suggested code does not consider the non-numerical features!!! But some of them could be meaningful!
def load_data(filename, max_rows=5000):
with open(filename, 'r') as f:
reader = csv.DictReader(f)
rows = list(reader)[:max_rows] # Limit data for memory
# Select numerical features
feature_names = ['age', 'years_employed', 'annual_income', 'credit_score',
'credit_history_years', 'savings_assets', 'current_debt',
'defaults_on_file', 'delinquencies_last_2yrs', 'derogatory_marks',
'loan_amount', 'interest_rate', 'debt_to_income_ratio',
'loan_to_income_ratio', 'payment_to_income_ratio']
X = []
y = []
for row in rows:
features = [float(row[name]) for name in feature_names]
X.append(features)
y.append(float(row['loan_status']))
return jnp.array(X), jnp.array(y)
# Normalize features
# [Philippe] Something new here: this standardizes each feature to zero mean and unit standard deviation (a z-score), so large values like annual_income don't drown out the small ratios
#
def normalize(X):
mean = jnp.mean(X, axis=0)
std = jnp.std(X, axis=0) + 1e-8
return (X - mean) / std
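# Example of my own to check what normalize does: normalize(jnp.array([[1., 100.], [3., 300.]]))
# returns roughly [[-1., -1.], [1., 1.]] -- each column ends up with mean 0 and std 1,
# so income-sized numbers and small ratios land on the same scale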
# Initialize network parameters
# [Philippe] In this code (when called), we get 4 layers: an input layer (15 features => 15 neurons), a hidden layer with 16 neurons (why one more than the input??), another hidden layer with 8 neurons, and an output layer with 1 neuron
def init_network(layer_sizes, key):
params = []
for i in range(len(layer_sizes) - 1):
key, subkey = jax.random.split(key)
w = jax.random.normal(subkey, (layer_sizes[i], layer_sizes[i+1])) * 0.1
b = jnp.zeros(layer_sizes[i+1])
params.append((w, b))
return params
# Forward pass
# [Philippe] tanh activation on every hidden layer; the last layer returns the raw score (no activation yet)
def forward(params, x):
for i, (w, b) in enumerate(params[:-1]):
x = jnp.tanh(jnp.dot(x, w) + b)
w, b = params[-1]
return jnp.dot(x, w) + b
# Sigmoid activation
# [Philippe] converts the raw output to a probability (0 to 1 range). Something new here... sigmoid used instead of softmax (it plays the same role for a single yes/no output)
def sigmoid(x):
return 1 / (1 + jnp.exp(-x))
# Binary cross-entropy loss
# [Philippe] penalizes the model by -log of the probability it gave to the correct outcome, so confident wrong answers cost a lot; the 1e-8 avoids log(0)
def loss_fn(params, x, y):
logits = forward(params, x)
probs = sigmoid(logits.squeeze())
return -jnp.mean(y * jnp.log(probs + 1e-8) + (1 - y) * jnp.log(1 - probs + 1e-8))
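# Worked example of my own: with y = [1., 0.] and predicted probabilities p = [0.9, 0.2],
# the loss is -(log(0.9) + log(1 - 0.2)) / 2 ≈ (0.105 + 0.223) / 2 ≈ 0.164,
# so confident correct predictions cost little and confident wrong ones cost a lot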
# Prediction function
# [Philippe] gets the network output and converts it to 1 (loan approved) or 0 (loan denied) by thresholding the probability at 0.5
def predict(params, x):
logits = forward(params, x)
return (sigmoid(logits.squeeze()) > 0.5).astype(jnp.float32)
# Training step
# [Philippe] computes the loss and its gradients in one call, then applies a plain gradient-descent update to every weight and bias
@jit
def train_step(params, x, y, lr):
loss, grads = jax.value_and_grad(loss_fn)(params, x, y)
params = [(w - lr * dw, b - lr * db) for (w, b), (dw, db) in zip(params, grads)]
return params, loss
# Main training loop
# [Philippe] there is data shuffling here. According to the AI, the intent is to prevent the network from learning the row order
def train(X, y, layer_sizes=[15, 16, 8, 1], epochs=500, lr=0.01, batch_size=64):
key = jax.random.PRNGKey(42)
# Normalize data
X_norm = normalize(X)
# Split data (80/20)
n_train = int(0.8 * len(X))
X_train, X_test = X_norm[:n_train], X_norm[n_train:]
y_train, y_test = y[:n_train], y[n_train:]
# Initialize network
params = init_network(layer_sizes, key)
# Training loop
n_batches = len(X_train) // batch_size
for epoch in range(epochs):
# Shuffle data
key, subkey = jax.random.split(key)
perm = jax.random.permutation(subkey, len(X_train))
X_train_shuffled = X_train[perm]
y_train_shuffled = y_train[perm]
for i in range(n_batches):
start = i * batch_size
end = start + batch_size
X_batch = X_train_shuffled[start:end]
y_batch = y_train_shuffled[start:end]
params, batch_loss = train_step(params, X_batch, y_batch, lr)
if (epoch + 1) % 50 == 0:
train_loss = loss_fn(params, X_train, y_train)
test_loss = loss_fn(params, X_test, y_test)
train_preds = predict(params, X_train)
test_preds = predict(params, X_test)
train_acc = jnp.mean(train_preds == y_train)
test_acc = jnp.mean(test_preds == y_test)
print(f"Epoch {epoch+1}/{epochs}")
print(f" Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
print(f" Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")
return params
if __name__ == "__main__":
# Load data
print("Loading data...")
X, y = load_data("datasets/Loan_approval_data_2025.csv")
print(f"Dataset: {X.shape[0]} samples, {X.shape[1]} features")
# Train model
print("\nTraining neural network...")
params = train(X, y)
print("\nTraining complete!")
Loading data...
Dataset: 5000 samples, 15 features

Training neural network...
Epoch 50/500
  Train Loss: 0.3570, Train Acc: 0.8338
  Test Loss: 0.3895, Test Acc: 0.8170
Epoch 100/500
  Train Loss: 0.3481, Train Acc: 0.8348
  Test Loss: 0.3834, Test Acc: 0.8220
Epoch 150/500
  Train Loss: 0.3263, Train Acc: 0.8510
  Test Loss: 0.3626, Test Acc: 0.8390
Epoch 200/500
  Train Loss: 0.3031, Train Acc: 0.8650
  Test Loss: 0.3396, Test Acc: 0.8510
Epoch 250/500
  Train Loss: 0.2948, Train Acc: 0.8690
  Test Loss: 0.3357, Test Acc: 0.8480
Epoch 300/500
  Train Loss: 0.2903, Train Acc: 0.8683
  Test Loss: 0.3341, Test Acc: 0.8510
Epoch 350/500
  Train Loss: 0.2866, Train Acc: 0.8698
  Test Loss: 0.3336, Test Acc: 0.8520
Epoch 400/500
  Train Loss: 0.2832, Train Acc: 0.8718
  Test Loss: 0.3328, Test Acc: 0.8510
Epoch 450/500
  Train Loss: 0.2798, Train Acc: 0.8740
  Test Loss: 0.3309, Test Acc: 0.8550
Epoch 500/500
  Train Loss: 0.2765, Train Acc: 0.8760
  Test Loss: 0.3311, Test Acc: 0.8560

Training complete!
Conclusion: about 85% test accuracy, with a small gap between training and test accuracy... not bad!¶
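Accuracy alone can be flattering if approvals and denials are imbalanced, so here is one last check of my own: it reuses `normalize`, `predict`, `params`, `X` and `y` from the cell above (assuming it has been run), rebuilds the same 80/20 split, and compares the model with an always-predict-the-majority baseline.
In [ ]:
# My own sanity check (assumes the cell above has been run)
X_norm = normalize(X)
n_train = int(0.8 * len(X))  # same 80/20 split as in train()
X_test_chk, y_test_chk = X_norm[n_train:], y[n_train:]
preds = predict(params, X_test_chk)
approval_rate = jnp.mean(y_test_chk)
baseline = jnp.maximum(approval_rate, 1 - approval_rate)  # always predict the majority class
print(f"approval rate in test set: {approval_rate:.3f}")
print(f"model accuracy:            {jnp.mean(preds == y_test_chk):.3f}")
print(f"majority-class baseline:   {baseline:.3f}")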