Philippe Libioulle - Fab Futures - Data Science


Week 6: density estimation - "Loan approval" dataset¶

Context¶

  • Source: Kaggle
  • Description: complete dataset of 50,000 loan applications across Credit Cards, Personal Loans, and Lines of Credit. Includes customer demographics, financial profiles, credit behavior, and approval decisions based on real US & Canadian banking criteria.
  • Credit: Brian Risk on Kaggle

Load dataset¶

In [8]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("datasets/Loan_approval_data_2025.csv", delimiter=',', encoding='ascii')
df = df.drop(['customer_id'], axis=1)  # Drop customer_id as it is not useful 

numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

for col in numeric_cols:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].median())  # impute numeric NaNs with the column median

for col in categorical_cols:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].mode()[0])  # impute categorical NaNs with the most frequent value
        
print("Dataset shape:", df.shape) 
Dataset shape: (50000, 19)
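To see how many values the median/mode imputation above actually filled, the raw CSV can be re-read and the missing counts printed (a minimal sketch, using the same path as above):

In [ ]:
# Count missing values per column in the raw file, before the imputation above
raw = pd.read_csv("datasets/Loan_approval_data_2025.csv")
missing = raw.isnull().sum()
print(missing[missing > 0].sort_values(ascending=False))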

Explore content¶

In [ ]:
df.head()
In [ ]:
x = df['credit_score']
y = df['interest_rate']
plt.figure(figsize=(6, 6))
plt.plot(x, y, '.', ms=1)
plt.xlim(350, 900)
plt.ylim(4, 25)
plt.xlabel('Credit score')
plt.ylabel('Interest rate (%)')
plt.show()

# In this diagram we can see two clouds, mainly because the interest rate is higher for credit cards than for other financial products
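To confirm that the two clouds correspond to different product types, the same scatter can be color-coded by product_type (a minimal sketch, assuming df from the cells above):

In [ ]:
# Color the scatter by product_type to see which cloud each product falls into
plt.figure(figsize=(6, 6))
for product, group in df.groupby('product_type'):
    plt.plot(group['credit_score'], group['interest_rate'], '.', ms=1, label=product)
plt.xlim(350, 900)
plt.ylim(4, 25)
plt.xlabel('Credit score')
plt.ylabel('Interest rate (%)')
plt.legend(markerscale=10)
plt.show()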

Experiment 1 - k-means¶

In [2]:
import matplotlib.pyplot as plt
from scipy.spatial import Voronoi,voronoi_plot_2d
import numpy as np
import time
#
# k-means parameters
#
nclusters = 6
nsteps = 35
#
# load data (for credit cards only)
#
categories_to_keep = ['Credit Card']  # keep only Credit Card rows; Personal Loan and Line of Credit are excluded here
filtered_df_multiple = df[df['product_type'].isin(categories_to_keep)]

xf = filtered_df_multiple['credit_score'].to_numpy()
npts = len(xf)
yf = filtered_df_multiple['interest_rate'].to_numpy()

#
# choose starting points
#
indices = np.random.randint(low=0, high=len(xf), size=nclusters)
mux = xf[indices].astype(float)  # cast to float so the mean updates are not truncated to integers
muy = yf[indices].astype(float)
#
# plot before iteration
#
fig,ax = plt.subplots()
plt.plot(xf,yf,'.')
vor = Voronoi(np.stack((mux,muy),axis=1))
voronoi_plot_2d(vor,ax=ax,show_points=True,show_vertices=False,point_size=20)
plt.autoscale()
plt.title('before k-means iterations')
plt.xlim(350, 900)
plt.ylim(4, 25)
plt.show()
#
# do k-means iteration
#
for i in range(nsteps):
    #
    # find closest points
    #
    xm = np.outer(xf,np.ones(len(mux)))
    ym = np.outer(yf,np.ones(len(muy)))
    muxm = np.outer(np.ones(len(xf)),mux)
    muym = np.outer(np.ones(len(xf)),muy)
    distances = np.sqrt((xm-muxm)**2+(ym-muym)**2)
    mins = np.argmin(distances,axis=1)
    #
    # update means
    #
    for c in range(len(mux)):  # separate loop variable instead of re-using i from the outer loop
        members = np.where(mins == c)[0]
        if len(members) > 0:  # skip empty clusters to avoid division by zero
            mux[c] = np.mean(xf[members])
            muy[c] = np.mean(yf[members])
#
# plot after iteration
#
fig,ax = plt.subplots()
plt.plot(xf,yf,'.')
vor = Voronoi(np.stack((mux,muy),axis=1))
voronoi_plot_2d(vor,ax=ax,show_points=True,show_vertices=False,point_size=20)
#plt.autoscale()
plt.title('after k-means iteration')
plt.xlim(350, 900)
plt.ylim(4, 25)
plt.show()
[Output: Voronoi plots of the credit-card points before and after the k-means iterations.]
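As a cross-check of the hand-rolled loop, scikit-learn's KMeans should land on roughly the same centers (a minimal sketch, assuming xf and yf from the cell above):

In [ ]:
# Cross-check with scikit-learn's KMeans on the same (credit_score, interest_rate) points
from sklearn.cluster import KMeans

X = np.stack((xf, yf), axis=1)
km = KMeans(n_clusters=nclusters, n_init=10, random_state=0).fit(X)
print("scikit-learn cluster centers:\n", km.cluster_centers_)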

Experiment 2 - Gaussian Mixture Model (GMM)¶

In [18]:
# This code starts from the code sample provided in class. I used AI (Claude) to review it.
# Here are the AI-suggested changes:
# - Only 3 iterations: Too few for convergence (improved to 20)
# - No convergence monitoring: Can't tell when algorithm has converged
# - Fixed random seed: Results vary each run (added seed for reproducibility)
# - Limited visualization: Only basic plots (added contour, convergence plots)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Set style for better-looking plots
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("="*70)
print("GAUSSIAN MIXTURE MODEL (GMM) ANALYSIS")
print("Clustering Loan Data by Credit Score and Interest Rate")
print("="*70)

# Load and preprocess data
print("\n1. Loading data...")
df = pd.read_csv("datasets/Loan_approval_data_2025.csv", delimiter=',', encoding='ascii')
print(f"   Total records: {len(df)}")
print(f"   Columns: {df.columns.tolist()}")

# Drop customer_id as it's not useful for clustering
df = df.drop(['customer_id'], axis=1)

# Handle missing values
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

for col in numeric_cols:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].median())

for col in categorical_cols:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].mode()[0])

# Filter for specific product types
categories_to_keep = ['Personal Loan', 'Credit Card', 'Line of Credit']
filtered_df = df[df['product_type'].isin(categories_to_keep)]
print(f"   Filtered records (3 product types): {len(filtered_df)}")

# Extract features for clustering
xf2 = filtered_df['credit_score'].to_numpy()
yf2 = filtered_df['interest_rate'].to_numpy()

print(f"\n2. Data summary:")
print(f"   Credit Score: min={xf2.min():.0f}, max={xf2.max():.0f}, mean={xf2.mean():.1f}")
print(f"   Interest Rate: min={yf2.min():.2f}%, max={yf2.max():.2f}%, mean={yf2.mean():.2f}%")

# GMM Parameters
npts = len(xf2)
nclusters = 3
nsteps = 20  
nplot = 100

print(f"\n3. GMM Configuration:")
print(f"   Number of clusters: {nclusters}")
print(f"   Number of data points: {npts}")
print(f"   EM iterations: {nsteps}")

# Initialize cluster parameters
print("\n4. Initializing cluster parameters...")
np.random.seed(42)  # For reproducibility
indices = np.random.uniform(low=0, high=len(xf2), size=nclusters).astype(int)
mux = xf2[indices].copy()
muy = yf2[indices].copy()
varx = np.ones(nclusters) * (np.max(xf2)-np.min(xf2))**2 / (nclusters * 2)
vary = np.ones(nclusters) * (np.max(yf2)-np.min(yf2))**2 / (nclusters * 2)
pc = np.ones(nclusters) / nclusters

print(f"   Initial cluster centers (credit_score, interest_rate):")
for c in range(nclusters):
    print(f"      Cluster {c+1}: ({mux[c]:.1f}, {muy[c]:.2f}%)")

# Plot initial state
fig, ax = plt.subplots(figsize=(10, 6))
plt.scatter(xf2, yf2, alpha=0.3, s=20, label='Data points')
plt.errorbar(mux, muy, xerr=np.sqrt(varx), yerr=np.sqrt(vary), 
             fmt='r*', markersize=20, capsize=5, linewidth=2, label='Cluster centers')
plt.xlim(350, 900)
plt.ylim(4, 25)
plt.xlabel('Credit Score', fontsize=12)
plt.ylabel('Interest Rate (%)', fontsize=12)
plt.title('Before EM Iteration (Random Initialization)', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
plt.close()

# Expectation-Maximization Algorithm
print("\n6. Running EM Algorithm...")
log_likelihood_history = []

for iteration in range(nsteps):
    # E-step: Calculate responsibilities (posterior probabilities)
    xm = np.outer(xf2, np.ones(nclusters))
    ym = np.outer(yf2, np.ones(nclusters))
    muxm = np.outer(np.ones(npts), mux)
    muym = np.outer(np.ones(npts), muy)
    varxm = np.outer(np.ones(npts), varx)
    varym = np.outer(np.ones(npts), vary)
    
    # Calculate likelihood P(v|c) - probability of data given cluster
    pvgc = (1/np.sqrt(2*np.pi*varxm)) * np.exp(-(xm-muxm)**2/(2*varxm)) * \
           (1/np.sqrt(2*np.pi*varym)) * np.exp(-(ym-muym)**2/(2*varym))
    
    # Weighted likelihood P(v,c) = P(v|c) * P(c)
    pvc = pvgc * np.outer(np.ones(npts), pc)
    
    # Responsibilities P(c|v) - probability of cluster given data
    pcgv = pvc / np.outer(np.sum(pvc, 1), np.ones(nclusters))
    
    # Calculate log-likelihood for convergence monitoring
    log_likelihood = np.sum(np.log(np.sum(pvc, axis=1) + 1e-10))
    log_likelihood_history.append(log_likelihood)
    
    # M-step: Update parameters
    pc = np.sum(pcgv, 0) / npts
    mux = np.sum(xm * pcgv, 0) / (npts * pc)
    muy = np.sum(ym * pcgv, 0) / (npts * pc)
    varx = 0.1 + np.sum((xm - muxm)**2 * pcgv, 0) / (npts * pc)
    vary = 0.1 + np.sum((ym - muym)**2 * pcgv, 0) / (npts * pc)
    
    if (iteration + 1) % 5 == 0 or iteration == 0:
        print(f"   Iteration {iteration+1}/{nsteps}: Log-likelihood = {log_likelihood:.2f}")

print("\n7. Final cluster parameters:")
for c in range(nclusters):
    print(f"   Cluster {c+1}:")
    print(f"      Center: (Credit Score={mux[c]:.1f}, Interest Rate={muy[c]:.2f}%)")
    print(f"      Variance: (σ²_x={varx[c]:.1f}, σ²_y={vary[c]:.2f})")
    print(f"      Prior probability: {pc[c]:.3f} ({pc[c]*100:.1f}%)")

# Assign data points to clusters
cluster_assignments = np.argmax(pcgv, axis=1)

# Plot after iteration with cluster assignments
fig, ax = plt.subplots(figsize=(10, 6))
colors = ['#1f77b4', '#ff7f0e', '#2ca02c','#3ba02c','#4aa02c']
for c in range(nclusters):
    mask = cluster_assignments == c
    plt.scatter(xf2[mask], yf2[mask], alpha=0.4, s=20, c=colors[c], 
                label=f'Cluster {c+1} (n={np.sum(mask)})')

plt.errorbar(mux, muy, xerr=np.sqrt(varx), yerr=np.sqrt(vary), 
             fmt='r*', markersize=20, capsize=5, linewidth=2, label='Cluster centers')
plt.xlim(350, 900)
plt.ylim(4, 25)
plt.xlabel('Credit Score', fontsize=12)
plt.ylabel('Interest Rate (%)', fontsize=12)
plt.title('After EM Iteration (Converged Clusters)', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
plt.close()

# Plot convergence
fig, ax = plt.subplots(figsize=(10, 6))
plt.plot(range(1, nsteps+1), log_likelihood_history, 'b-o', linewidth=2, markersize=6)
plt.xlabel('Iteration', fontsize=12)
plt.ylabel('Log-Likelihood', fontsize=12)
plt.title('EM Algorithm Convergence', fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
plt.close()

# 3D probability distribution
xplot = np.linspace(np.min(xf2), np.max(xf2), nplot)
yplot = np.linspace(np.min(yf2), np.max(yf2), nplot)
X, Y = np.meshgrid(xplot, yplot)
p = np.zeros((nplot, nplot))

for c in range(nclusters):
    p += np.exp(-(X-mux[c])**2/(2*varx[c])) / np.sqrt(2*np.pi*varx[c]) * \
         np.exp(-(Y-muy[c])**2/(2*vary[c])) / np.sqrt(2*np.pi*vary[c]) * pc[c]

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
surf = ax.plot_surface(X, Y, p, cmap='viridis', alpha=0.8)
ax.set_xlabel('Credit Score', fontsize=12)
ax.set_ylabel('Interest Rate (%)', fontsize=12)
ax.set_zlabel('Probability Density', fontsize=12)
ax.set_title('GMM Probability Distribution (3D)', fontsize=14, fontweight='bold')
fig.colorbar(surf, shrink=0.5, aspect=5)
plt.tight_layout()
plt.show()
plt.close()

# Contour plot
fig, ax = plt.subplots(figsize=(10, 6))
contour = plt.contourf(X, Y, p, levels=20, cmap='viridis', alpha=0.8)
plt.scatter(xf2, yf2, alpha=0.2, s=10, c='white', edgecolors='black', linewidths=0.5)
plt.colorbar(contour, label='Probability Density')
plt.xlabel('Credit Score', fontsize=12)
plt.ylabel('Interest Rate (%)', fontsize=12)
plt.title('GMM Probability Distribution (Contour)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()
plt.close()

# Cluster statistics
print("\n" + "="*70)
print("CLUSTER ANALYSIS SUMMARY")
print("="*70)
for c in range(nclusters):
    mask = cluster_assignments == c
    cluster_data = filtered_df[mask]
    print(f"\nCluster {c+1} ({np.sum(mask)} customers, {np.sum(mask)/len(filtered_df)*100:.1f}%):")
    print(f"  Credit Score: {xf2[mask].mean():.1f} ± {xf2[mask].std():.1f}")
    print(f"  Interest Rate: {yf2[mask].mean():.2f}% ± {yf2[mask].std():.2f}%")
    print(f"  Loan Status (Approved): {cluster_data['loan_status'].mean()*100:.1f}%")
    print(f"  Product Type Distribution:")
    for product, count in cluster_data['product_type'].value_counts().items():
        print(f"    - {product}: {count} ({count/len(cluster_data)*100:.1f}%)")
======================================================================
GAUSSIAN MIXTURE MODEL (GMM) ANALYSIS
Clustering Loan Data by Credit Score and Interest Rate
======================================================================

1. Loading data...
   Total records: 50000
   Columns: ['customer_id', 'age', 'occupation_status', 'years_employed', 'annual_income', 'credit_score', 'credit_history_years', 'savings_assets', 'current_debt', 'defaults_on_file', 'delinquencies_last_2yrs', 'derogatory_marks', 'product_type', 'loan_intent', 'loan_amount', 'interest_rate', 'debt_to_income_ratio', 'loan_to_income_ratio', 'payment_to_income_ratio', 'loan_status']
   Filtered records (3 product types): 50000

2. Data summary:
   Credit Score: min=348, max=850, mean=643.6
   Interest Rate: min=6.00%, max=23.00%, mean=15.50%

3. GMM Configuration:
   Number of clusters: 3
   Number of data points: 50000
   EM iterations: 20

4. Initializing cluster parameters...
   Initial cluster centers (credit_score, interest_rate):
      Cluster 1: (640.0, 20.28%)
      Cluster 2: (659.0, 10.48%)
      Cluster 3: (547.0, 15.08%)
[Output: scatter plot of the data with the randomly initialized cluster centers ('Before EM Iteration').]
6. Running EM Algorithm...
   Iteration 1/20: Log-likelihood = -473242.99
   Iteration 5/20: Log-likelihood = -416556.71
   Iteration 10/20: Log-likelihood = -413901.43
   Iteration 15/20: Log-likelihood = -413637.34
   Iteration 20/20: Log-likelihood = -413436.60

7. Final cluster parameters:
   Cluster 1:
      Center: (Credit Score=597.8, Interest Rate=18.84%)
      Variance: (σ²_x=2579.4, σ²_y=8.78)
      Prior probability: 0.356 (35.6%)
   Cluster 2:
      Center: (Credit Score=692.0, Interest Rate=11.64%)
      Variance: (σ²_x=2555.6, σ²_y=6.36)
      Prior probability: 0.337 (33.7%)
   Cluster 3:
      Center: (Credit Score=643.6, Interest Rate=15.85%)
      Variance: (σ²_x=2850.6, σ²_y=7.65)
      Prior probability: 0.307 (30.7%)
[Output: converged cluster assignments, EM convergence curve, 3D GMM probability surface, and contour plot.]
======================================================================
CLUSTER ANALYSIS SUMMARY
======================================================================

Cluster 1 (17580 customers, 35.2%):
  Credit Score: 585.2 ± 43.3
  Interest Rate: 19.37% ± 2.60%
  Loan Status (Approved): 33.0%
  Product Type Distribution:
    - Credit Card: 12533 (71.3%)
    - Personal Loan: 4038 (23.0%)
    - Line of Credit: 1009 (5.7%)

Cluster 2 (16925 customers, 33.9%):
  Credit Score: 694.9 ± 48.1
  Interest Rate: 11.15% ± 1.95%
  Loan Status (Approved): 73.0%
  Product Type Distribution:
    - Personal Loan: 8408 (49.7%)
    - Line of Credit: 6959 (41.1%)
    - Credit Card: 1558 (9.2%)

Cluster 3 (15495 customers, 31.0%):
  Credit Score: 653.9 ± 44.9
  Interest Rate: 15.86% ± 1.94%
  Loan Status (Approved): 60.4%
  Product Type Distribution:
    - Credit Card: 8364 (54.0%)
    - Personal Loan: 5077 (32.8%)
    - Line of Credit: 2054 (13.3%)

The code uses unsupervised machine learning to automatically discover 3 customer segments in loan data based on credit scores and interest rates. It's like finding natural groupings without being told what to look for.

The Algorithm (Expectation-Maximization):

  • Starts with random guesses for 3 cluster centers
  • E-Step: Calculates "how likely is each customer to belong to each cluster?"
  • M-Step: Updates cluster parameters based on those probabilities
  • Repeats until the clusters stabilize (a scikit-learn cross-check of this procedure is sketched after this summary)

What It Found:

  • High-Risk Cluster (35%): Credit score ~585, interest rate ~19.4%, mostly credit cards, 33% approval
  • Low-Risk Cluster (34%): Credit score ~695, interest rate ~11.2%, personal loans/LOCs, 73% approval
  • Medium-Risk Cluster (31%): Credit score ~654, interest rate ~15.9%, mixed products, 60% approval
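The same E-step/M-step procedure is what scikit-learn's GaussianMixture implements, so it can be used to cross-check the hand-rolled EM above (a minimal sketch with diagonal covariances, assuming xf2 and yf2 from the GMM cell). The weights, means and variances should come out close to the "Final cluster parameters" printed above, up to cluster ordering.

In [ ]:
# Cross-check the hand-rolled EM with scikit-learn's GaussianMixture
from sklearn.mixture import GaussianMixture

X2 = np.stack((xf2, yf2), axis=1)
gmm = GaussianMixture(n_components=3, covariance_type='diag', random_state=42).fit(X2)
print("weights:", gmm.weights_)
print("means:\n", gmm.means_)
print("diagonal variances:\n", gmm.covariances_)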
In [51]:
import matplotlib.pyplot as plt
import numpy as np

plt.figure(figsize=(8, 8))
# Create a 2D histogram
plt.hist2d(xf2, yf2, bins=50, cmap='viridis')

# Add a colorbar for reference
plt.colorbar(label='Count')

# Add labels and title
plt.xlabel('Credit score')
plt.ylabel('Interest rate (%)')
plt.title('2D histogram of credit score and interest rate')

# Display the plot
plt.show()

# This diagram shows a peak around a credit score of 600-700, the same information we saw in the 3D diagram above.
# There is still a secondary, lower peak in the other cloud.
[Output: 2D histogram of credit score vs. interest rate.]
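Since the week's topic is density estimation, the same density can also be drawn with a kernel density estimate instead of a histogram. This is a sketch using seaborn on a random subsample (a full 2D KDE on all 50,000 points is slow), assuming xf2 and yf2 from the GMM cell:

In [ ]:
# Kernel density estimate of the same two variables, on a random subsample
rng = np.random.default_rng(0)
idx = rng.choice(len(xf2), size=5000, replace=False)
plt.figure(figsize=(8, 8))
sns.kdeplot(x=xf2[idx], y=yf2[idx], fill=True, cmap='viridis', levels=20)
plt.xlim(350, 900)
plt.ylim(4, 25)
plt.xlabel('Credit score')
plt.ylabel('Interest rate (%)')
plt.title('2D KDE of credit score and interest rate (subsample)')
plt.show()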

Experiment 3 - Cluster-Weighted Modeling (WORK IN PROGRESS..)¶

In [127]:
import matplotlib.pyplot as plt
import numpy as np
#
# state cluster-weighted modeling parameters
#
dim = 2
nstates = 2
nclusters = 5
nsamples = 100
momentum = 0.9
rate = 0.1
min_var = 0.1
niterations = 100
#
# load data:   NOTE: in this experiment, I selected two other features: age and interest rate
#

feature1 = 'interest_rate'
feature1_lim = [0,25]
feature2 = 'age' 
feature2_lim = [15,70]

# NOTE: in this experiment state 0 corresponds to granted loans (loan_status == 1)
# and state 1 to denied loans (loan_status == 0)
df_granted = df[df['loan_status'] == 1]
df_denied = df[df['loan_status'] == 0]
zero_states = np.stack((df_granted[feature1].to_numpy(), df_granted[feature2].to_numpy()), axis=0)
nzeros = zero_states.shape[1]
print('nzeros: ', nzeros)
print('Count of granted loans: ', len(zero_states[0]))
one_states = np.stack((df_denied[feature1].to_numpy(), df_denied[feature2].to_numpy()), axis=0)
print('Count of denied loans: ', len(one_states[0]))
nones = one_states.shape[1]
print('nones: ', nones)
print('X - min zero_states: ',min(zero_states[0]), ' max zero_states: ',max(zero_states[0]), ' - min one_states: ',min(one_states[0]),'max one_states: ',max(one_states[0]))
print('Y - min zero_states: ',min(zero_states[1]), ' max zero_states: ',max(zero_states[1]), ' - min one_states: ',min(one_states[1]),'max one_states: ',max(one_states[1]))
points = np.hstack((zero_states,one_states))
states = np.append(np.zeros(nzeros,dtype=int),np.ones(nones,dtype=int))
npts = len(states)
print('npts: ', npts)
nzeros:  27523
Count of granted loans:  27523
Count of denied loans:  22477
nones:  22477
X - min zero_states:  6.0  max zero_states:  23.0  - min one_states:  6.03 max one_states:  23.0
Y - min zero_states:  18.0  max zero_states:  70.0  - min one_states:  18.0 max one_states:  70.0
npts:  50000
In [128]:
#
# initialize arrays
#
indices = np.random.randint(0,npts,nclusters)
means = points[:,indices]
dmean = np.zeros((dim,nclusters))
covariances = np.zeros((nclusters,2,2))
covariances[:,0,0] = np.var(points[0,:])
covariances[:,1,1] = np.var(points[1,:])
pc = np.ones(nclusters)/nclusters # p(c)
pxgc = np.zeros((nclusters,npts)) # p(x|c)
psgxc = np.ones((nclusters,nstates))/nstates # p(s|xc)
#
# plot data
#
plt.plot(zero_states[0,:],zero_states[1,:],ls='',marker='.',markeredgecolor='green')
plt.plot(one_states[0,:],one_states[1,:],ls='',marker='.',markeredgecolor='orange')
plt.xlim(feature1_lim)
plt.ylim(feature2_lim)
#
# plot clusters
#
def plot_covar():
   for cluster in range(means.shape[1]):
      x = means[0,cluster]
      y = means[1,cluster]
      type = np.argmax(psgxc[cluster,:])
      w,v = np.linalg.eig(covariances[cluster])
      w = np.sqrt(w)
      plt.plot([x-w[0]*v[0,0],x+w[0]*v[0,0]],[y-w[0]*v[1,0],y+w[0]*v[1,0]],'k')
      plt.plot([x-w[1]*v[0,1],x+w[1]*v[0,1]],[y-w[1]*v[1,1],y+w[1]*v[1,1]],'k')
      plt.plot([x],[y],'k',marker=f'${type}$',markersize=15)
plot_covar()
plt.title('covariance clusters before iteration')
plt.show()
#
# do E-M iteration
#
for i in range(niterations):
   for cluster in range(nclusters):
      mean = np.outer(means[:,cluster],np.ones(npts))
      cinv = np.linalg.pinv(covariances[cluster])
      pxgc[cluster,:] = (np.sqrt(np.linalg.det(cinv))/(2*np.pi)**(dim/2))\
         *np.exp(-np.sum((points-mean)*(cinv@(points-mean)),0)/2)
   pxc = pxgc*np.outer(pc,np.ones(npts))
   pcgx = pxc/np.outer(np.ones(nclusters),np.sum(pxc,0))
   psxc = psgxc[:,states]*pxc
   pcgsx = psxc/np.outer(np.ones(nclusters),np.sum(psxc,0))
   pc = np.sum(pcgsx,1)/npts
   for cluster in range(nclusters):
      newmean = momentum*dmean[:,cluster]\
         +np.sum(points*np.outer(np.ones(dim),pcgsx[cluster,:]),1)\
         /np.sum(np.outer(np.ones(dim),pcgsx[cluster,:]),1)
      dmean[:,cluster] = newmean-means[:,cluster]
      means[:,cluster] += rate*dmean[:,cluster]
      m = np.outer(means[:,cluster],np.ones(npts))
      for d in range(dim):
         covariances[cluster][:,d] = np.sum(np.outer(\
            np.ones(dim),points[d,:]-m[d,:])*(points-m)*\
            np.outer(np.ones(dim),pcgsx[cluster,:]),1)\
            /np.sum(np.outer(np.ones(dim),pcgsx[cluster,:]),1)\
            +min_var
   for state in range(nstates):
      index = np.argwhere(states == state)
      psgxc[:,state] = np.sum(pcgsx[:,index],1)[:,0]/np.sum(pcgsx[:,:],1)
#
# plot data
#
plt.plot(zero_states[0,:],zero_states[1,:],ls='',marker='$0$',markeredgecolor='green')
plt.plot(one_states[0,:],one_states[1,:],ls='',marker='$1$',markeredgecolor='orange')
plt.xlim(feature1_lim)
plt.ylim(feature2_lim)
#
# plot clusters
#
plot_covar()
#
# plot decision boundary
#
ngrid = 100
xmin = np.min(points[0,:])
xmax = np.max(points[0,:])
ymin = np.min(points[1,:])
ymax = np.max(points[1,:])
x = np.linspace(xmin,xmax,ngrid)
y = np.linspace(ymin,ymax,ngrid)
mx,my = np.meshgrid(x,y)
x = np.reshape(mx,(ngrid*ngrid))
y = np.reshape(my,(ngrid*ngrid))
plotpoints = np.vstack((x,y))
pxgc = np.zeros((nclusters,ngrid*ngrid))
for cluster in range(nclusters):
   mean = np.outer(means[:,cluster],np.ones(ngrid*ngrid))
   cinv = np.linalg.pinv(covariances[cluster])
   pxgc[cluster,:] = (np.sqrt(np.linalg.det(cinv))/(2*np.pi)**(dim/2))\
      *np.exp(-np.sum((plotpoints-mean)*(cinv@(plotpoints-mean)),0)/2)
pxc = pxgc*np.outer(pc,np.ones(ngrid*ngrid))
p = np.sum(np.outer(psgxc[:,0],np.ones(ngrid*ngrid))*pxc,0)/np.sum(pxc,0)
p = np.reshape(p,(ngrid,ngrid))
plt.contour(mx,my,p,[0.5])
plt.title('covariance clusters and decision boundaries after iteration')
plt.show()
#
# plot probability surface
#
fig, ax = plt.subplots(subplot_kw={"projection":"3d"})
ax.plot_wireframe(mx,my,p,rstride=10,cstride=10,color='gray')
plt.plot(zero_states[0,:],zero_states[1,:],zdir='z',ls='',marker='$0$',markeredgecolor='green')
plt.plot(one_states[0,:],one_states[1,:],zdir='z',ls='',marker='$1$',markeredgecolor='orange')
plt.title('probability of state 0')
plt.show()
[Output: covariance clusters before iteration, covariance clusters with the decision boundary after iteration, and the probability surface for state 0.]

ok, I'm lost...
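Given the note above that state 0 corresponds to granted loans, a simpler way to get the same kind of decision surface is to fit one Gaussian mixture per state and compute the posterior P(state 0 | x) from the two class-conditional densities. This is not the cluster-weighted update above, just a rough sketch using scikit-learn, and it assumes zero_states, one_states, nzeros and nones from the data-loading cell:

In [ ]:
# Simpler stand-in for the cluster-weighted model: one Gaussian mixture per state
from sklearn.mixture import GaussianMixture

gmm0 = GaussianMixture(n_components=3, random_state=0).fit(zero_states.T)  # state 0: granted
gmm1 = GaussianMixture(n_components=3, random_state=0).fit(one_states.T)   # state 1: denied
prior0 = nzeros / (nzeros + nones)
prior1 = nones / (nzeros + nones)

def p_state0(xy):
    # posterior P(state 0 | x) from the two class-conditional densities
    l0 = prior0 * np.exp(gmm0.score_samples(xy))
    l1 = prior1 * np.exp(gmm1.score_samples(xy))
    return l0 / (l0 + l1)

# example: a few (interest_rate, age) points
test_points = np.array([[10.0, 30.0], [20.0, 25.0], [15.0, 50.0]])
print(p_state0(test_points))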

In [132]:
import matplotlib.pyplot as plt
import numpy as np

feature1 = 'interest_rate'
feature2 = 'age'

df_denied = df[df['loan_status']== 0]

plt.figure(figsize=(8, 8))
# Create a 2D histogram
plt.hist2d(df_denied[feature1], df_denied[feature2], bins=20, cmap='viridis')

# Add a colorbar for reference
plt.colorbar(label='Count - denied loans')

# Add labels and title
plt.xlabel('Interest rate')
plt.ylabel('Age')
plt.title('2D Histogram of interest rate and age for denied loans')

# Display the plot
feature1_lim = [5,25]
feature2_lim = [15,70]
plt.xlim(feature1_lim)
plt.ylim(feature2_lim)
plt.show()
[Output: 2D histogram of interest rate vs. age for denied loans.]
In [133]:
import matplotlib.pyplot as plt
import numpy as np

feature1 = 'interest_rate'
feature2 = 'age'

df_granted = df[df['loan_status']== 1]

plt.figure(figsize=(8, 8))
# Create a 2D histogram
plt.hist2d(df_granted[feature1], df_granted[feature2], bins=20, cmap='viridis')

# Add a colorbar for reference
plt.colorbar(label='Count - granted loans')

# Add labels and title
plt.xlabel('Interest rate')
plt.ylabel('Age')
plt.title('2D Histogram of interest rate and age for granted loans')

# Display the plot
feature1_lim = [5,25]
feature2_lim = [15,70]
plt.xlim(feature1_lim)
plt.ylim(feature2_lim)
plt.show()
[Output: 2D histogram of interest rate vs. age for granted loans.]
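The two histograms above are easier to compare when combined into one view: the fraction of granted loans per (interest rate, age) bin. This is a minimal sketch, assuming df_granted and df_denied from the two cells above:

In [ ]:
# Approval rate per (interest_rate, age) bin, combining the two histograms above
granted, xedges, yedges = np.histogram2d(df_granted['interest_rate'], df_granted['age'],
                                         bins=20, range=[[5, 25], [15, 70]])
denied, _, _ = np.histogram2d(df_denied['interest_rate'], df_denied['age'],
                              bins=20, range=[[5, 25], [15, 70]])
total = granted + denied
rate = np.divide(granted, total, out=np.full_like(total, np.nan), where=total > 0)

plt.figure(figsize=(8, 8))
plt.pcolormesh(xedges, yedges, rate.T, cmap='viridis')
plt.colorbar(label='Fraction of granted loans')
plt.xlabel('Interest rate')
plt.ylabel('Age')
plt.title('Approval rate per (interest rate, age) bin')
plt.show()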