Week 6: density estimation - "Loan approval" dataset¶
Context¶
- Source: Kaggle
- Description: complete dataset of 50,000 loan applications across Credit Cards, Personal Loans, and Lines of Credit. Includes customer demographics, financial profiles, credit behavior, and approval decisions based on real US & Canadian banking criteria.
- Credit: Brian Risk on Kaggle
Load dataset¶
In [8]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv("datasets/Loan_approval_data_2025.csv", delimiter=',', encoding='ascii')
df = df.drop(['customer_id'], axis=1) # Drop customer_id as it is not useful
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
for col in numeric_cols:
if df[col].isnull().sum() > 0:
df[col].fillna(df[col].median(), inplace=True)
for col in categorical_cols:
if df[col].isnull().sum() > 0:
df[col].fillna(df[col].mode()[0], inplace=True)
print("Dataset shape:", df.shape)
Dataset shape: (50000, 19)
Explore content¶
In [ ]:
df.head()
In [ ]:
x = df['credit_score']
y = df['interest_rate']
plt.figure(figsize=(6, 6))
plt.plot(x,y,'.', ms=1)
plt.xlim(350, 900)
plt.ylim(4, 25)
plt.show()
# In this diagram, we can see two clouds, mainly due to the fact that interest rate is higher for credit cards than for other financials products
Experiment 1 - k-means¶
In [2]:
import matplotlib.pyplot as plt
from scipy.spatial import Voronoi,voronoi_plot_2d
import numpy as np
import time
#
# k-means parameters
#
nclusters = 6
nsteps = 35
#
# load data (for credit cards only )
#
categories_to_keep = ['NO_Personal Loan', 'Credit Card', 'NO_Line of Credit']
filtered_df_multiple = df[df['product_type'].isin(categories_to_keep)]
xf = filtered_df_multiple['credit_score'].to_numpy()
npts = len(xf)
yf = filtered_df_multiple['interest_rate'] .to_numpy()
#
# choose starting points
#
indices = np.random.uniform(low=0,high=len(xf),size=nclusters).astype(int)
mux = xf[indices]
muy = yf[indices]
#
# plot before iteration
#
fig,ax = plt.subplots()
plt.plot(xf,yf,'.')
vor = Voronoi(np.stack((mux,muy),axis=1))
voronoi_plot_2d(vor,ax=ax,show_points=True,show_vertices=False,point_size=20)
plt.autoscale()
plt.title('before k-means iterations')
plt.xlim(350, 900)
plt.ylim(4, 25)
plt.show()
#
# do k-means iteration
#
for i in range(nsteps):
#
# find closest points
#
xm = np.outer(xf,np.ones(len(mux)))
ym = np.outer(yf,np.ones(len(muy)))
muxm = np.outer(np.ones(len(xf)),mux)
muym = np.outer(np.ones(len(xf)),muy)
distances = np.sqrt((xm-muxm)**2+(ym-muym)**2)
mins = np.argmin(distances,axis=1)
#
# update means
#
for i in range(len(mux)):
index = np.where(mins == i)
mux[i] = np.sum(xf[index])/len(index[0])
muy[i] = np.sum(yf[index])/len(index[0])
#
# plot after iteration
#
fig,ax = plt.subplots()
plt.plot(xf,yf,'.')
vor = Voronoi(np.stack((mux,muy),axis=1))
voronoi_plot_2d(vor,ax=ax,show_points=True,show_vertices=False,point_size=20)
#plt.autoscale()
plt.title('after k-means iteration')
plt.xlim(350, 900)
plt.ylim(4, 25)
plt.show()
Experiment 2 - Kernel Density Estimation¶
In [18]:
# This code starts with the code sample provided in the class. I used AI (Claude) to review it.
# Here are the AI suggested changes:
# - Only 3 iterations: Too few for convergence (improved to 20)
# - No convergence monitoring: Can't tell when algorithm has converged
# - Fixed random seed: Results vary each run (added seed for reproducibility)
# - Limited visualization: Only basic plots (added contour, convergence plots)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Set style for better-looking plots
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
print("="*70)
print("GAUSSIAN MIXTURE MODEL (GMM) ANALYSIS")
print("Clustering Loan Data by Credit Score and Interest Rate")
print("="*70)
# Load and preprocess data
print("\n1. Loading data...")
df = pd.read_csv("datasets/Loan_approval_data_2025.csv", delimiter=',', encoding='ascii')
print(f" Total records: {len(df)}")
print(f" Columns: {df.columns.tolist()}")
# Drop customer_id as it's not useful for clustering
df = df.drop(['customer_id'], axis=1)
# Handle missing values
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
for col in numeric_cols:
if df[col].isnull().sum() > 0:
df[col].fillna(df[col].median(), inplace=True)
for col in categorical_cols:
if df[col].isnull().sum() > 0:
df[col].fillna(df[col].mode()[0], inplace=True)
# Filter for specific product types
categories_to_keep = ['Personal Loan', 'Credit Card', 'Line of Credit']
filtered_df = df[df['product_type'].isin(categories_to_keep)]
print(f" Filtered records (3 product types): {len(filtered_df)}")
# Extract features for clustering
xf2 = filtered_df['credit_score'].to_numpy()
yf2 = filtered_df['interest_rate'].to_numpy()
print(f"\n2. Data summary:")
print(f" Credit Score: min={xf2.min():.0f}, max={xf2.max():.0f}, mean={xf2.mean():.1f}")
print(f" Interest Rate: min={yf2.min():.2f}%, max={yf2.max():.2f}%, mean={yf2.mean():.2f}%")
# GMM Parameters
npts = len(xf2)
nclusters = 3
nsteps = 20
nplot = 100
print(f"\n3. GMM Configuration:")
print(f" Number of clusters: {nclusters}")
print(f" Number of data points: {npts}")
print(f" EM iterations: {nsteps}")
# Initialize cluster parameters
print("\n4. Initializing cluster parameters...")
np.random.seed(42) # For reproducibility
indices = np.random.uniform(low=0, high=len(xf2), size=nclusters).astype(int)
mux = xf2[indices].copy()
muy = yf2[indices].copy()
varx = np.ones(nclusters) * (np.max(xf2)-np.min(xf2))**2 / (nclusters * 2)
vary = np.ones(nclusters) * (np.max(yf2)-np.min(yf2))**2 / (nclusters * 2)
pc = np.ones(nclusters) / nclusters
print(f" Initial cluster centers (credit_score, interest_rate):")
for c in range(nclusters):
print(f" Cluster {c+1}: ({mux[c]:.1f}, {muy[c]:.2f}%)")
# Plot initial state
fig, ax = plt.subplots(figsize=(10, 6))
plt.scatter(xf2, yf2, alpha=0.3, s=20, label='Data points')
plt.errorbar(mux, muy, xerr=np.sqrt(varx), yerr=np.sqrt(vary),
fmt='r*', markersize=20, capsize=5, linewidth=2, label='Cluster centers')
plt.xlim(350, 900)
plt.ylim(4, 25)
plt.xlabel('Credit Score', fontsize=12)
plt.ylabel('Interest Rate (%)', fontsize=12)
plt.title('Before EM Iteration (Random Initialization)', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
plt.close()
# Expectation-Maximization Algorithm
print("\n6. Running EM Algorithm...")
log_likelihood_history = []
for iteration in range(nsteps):
# E-step: Calculate responsibilities (posterior probabilities)
xm = np.outer(xf2, np.ones(nclusters))
ym = np.outer(yf2, np.ones(nclusters))
muxm = np.outer(np.ones(npts), mux)
muym = np.outer(np.ones(npts), muy)
varxm = np.outer(np.ones(npts), varx)
varym = np.outer(np.ones(npts), vary)
# Calculate likelihood P(v|c) - probability of data given cluster
pvgc = (1/np.sqrt(2*np.pi*varxm)) * np.exp(-(xm-muxm)**2/(2*varxm)) * \
(1/np.sqrt(2*np.pi*varym)) * np.exp(-(ym-muym)**2/(2*varym))
# Weighted likelihood P(v,c) = P(v|c) * P(c)
pvc = pvgc * np.outer(np.ones(npts), pc)
# Responsibilities P(c|v) - probability of cluster given data
pcgv = pvc / np.outer(np.sum(pvc, 1), np.ones(nclusters))
# Calculate log-likelihood for convergence monitoring
log_likelihood = np.sum(np.log(np.sum(pvc, axis=1) + 1e-10))
log_likelihood_history.append(log_likelihood)
# M-step: Update parameters
pc = np.sum(pcgv, 0) / npts
mux = np.sum(xm * pcgv, 0) / (npts * pc)
muy = np.sum(ym * pcgv, 0) / (npts * pc)
varx = 0.1 + np.sum((xm - muxm)**2 * pcgv, 0) / (npts * pc)
vary = 0.1 + np.sum((ym - muym)**2 * pcgv, 0) / (npts * pc)
if (iteration + 1) % 5 == 0 or iteration == 0:
print(f" Iteration {iteration+1}/{nsteps}: Log-likelihood = {log_likelihood:.2f}")
print("\n7. Final cluster parameters:")
for c in range(nclusters):
print(f" Cluster {c+1}:")
print(f" Center: (Credit Score={mux[c]:.1f}, Interest Rate={muy[c]:.2f}%)")
print(f" Variance: (σ²_x={varx[c]:.1f}, σ²_y={vary[c]:.2f})")
print(f" Prior probability: {pc[c]:.3f} ({pc[c]*100:.1f}%)")
# Assign data points to clusters
cluster_assignments = np.argmax(pcgv, axis=1)
# Plot after iteration with cluster assignments
fig, ax = plt.subplots(figsize=(10, 6))
colors = ['#1f77b4', '#ff7f0e', '#2ca02c','#3ba02c','#4aa02c']
for c in range(nclusters):
mask = cluster_assignments == c
plt.scatter(xf2[mask], yf2[mask], alpha=0.4, s=20, c=colors[c],
label=f'Cluster {c+1} (n={np.sum(mask)})')
plt.errorbar(mux, muy, xerr=np.sqrt(varx), yerr=np.sqrt(vary),
fmt='r*', markersize=20, capsize=5, linewidth=2, label='Cluster centers')
plt.xlim(350, 900)
plt.ylim(4, 25)
plt.xlabel('Credit Score', fontsize=12)
plt.ylabel('Interest Rate (%)', fontsize=12)
plt.title('After EM Iteration (Converged Clusters)', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
plt.close()
# Plot convergence
fig, ax = plt.subplots(figsize=(10, 6))
plt.plot(range(1, nsteps+1), log_likelihood_history, 'b-o', linewidth=2, markersize=6)
plt.xlabel('Iteration', fontsize=12)
plt.ylabel('Log-Likelihood', fontsize=12)
plt.title('EM Algorithm Convergence', fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
plt.close()
# 3D probability distribution
xplot = np.linspace(np.min(xf2), np.max(xf2), nplot)
yplot = np.linspace(np.min(yf2), np.max(yf2), nplot)
X, Y = np.meshgrid(xplot, yplot)
p = np.zeros((nplot, nplot))
for c in range(nclusters):
p += np.exp(-(X-mux[c])**2/(2*varx[c])) / np.sqrt(2*np.pi*varx[c]) * \
np.exp(-(Y-muy[c])**2/(2*vary[c])) / np.sqrt(2*np.pi*vary[c]) * pc[c]
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
surf = ax.plot_surface(X, Y, p, cmap='viridis', alpha=0.8)
ax.set_xlabel('Credit Score', fontsize=12)
ax.set_ylabel('Interest Rate (%)', fontsize=12)
ax.set_zlabel('Probability Density', fontsize=12)
ax.set_title('GMM Probability Distribution (3D)', fontsize=14, fontweight='bold')
fig.colorbar(surf, shrink=0.5, aspect=5)
plt.tight_layout()
plt.show()
plt.close()
# Contour plot
fig, ax = plt.subplots(figsize=(10, 6))
contour = plt.contourf(X, Y, p, levels=20, cmap='viridis', alpha=0.8)
plt.scatter(xf2, yf2, alpha=0.2, s=10, c='white', edgecolors='black', linewidths=0.5)
plt.colorbar(contour, label='Probability Density')
plt.xlabel('Credit Score', fontsize=12)
plt.ylabel('Interest Rate (%)', fontsize=12)
plt.title('GMM Probability Distribution (Contour)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()
plt.close()
# Cluster statistics
print("\n" + "="*70)
print("CLUSTER ANALYSIS SUMMARY")
print("="*70)
for c in range(nclusters):
mask = cluster_assignments == c
cluster_data = filtered_df[mask]
print(f"\nCluster {c+1} ({np.sum(mask)} customers, {np.sum(mask)/len(filtered_df)*100:.1f}%):")
print(f" Credit Score: {xf2[mask].mean():.1f} ± {xf2[mask].std():.1f}")
print(f" Interest Rate: {yf2[mask].mean():.2f}% ± {yf2[mask].std():.2f}%")
print(f" Loan Status (Approved): {cluster_data['loan_status'].mean()*100:.1f}%")
print(f" Product Type Distribution:")
for product, count in cluster_data['product_type'].value_counts().items():
print(f" - {product}: {count} ({count/len(cluster_data)*100:.1f}%)")
======================================================================
GAUSSIAN MIXTURE MODEL (GMM) ANALYSIS
Clustering Loan Data by Credit Score and Interest Rate
======================================================================
1. Loading data...
Total records: 50000
Columns: ['customer_id', 'age', 'occupation_status', 'years_employed', 'annual_income', 'credit_score', 'credit_history_years', 'savings_assets', 'current_debt', 'defaults_on_file', 'delinquencies_last_2yrs', 'derogatory_marks', 'product_type', 'loan_intent', 'loan_amount', 'interest_rate', 'debt_to_income_ratio', 'loan_to_income_ratio', 'payment_to_income_ratio', 'loan_status']
Filtered records (3 product types): 50000
2. Data summary:
Credit Score: min=348, max=850, mean=643.6
Interest Rate: min=6.00%, max=23.00%, mean=15.50%
3. GMM Configuration:
Number of clusters: 3
Number of data points: 50000
EM iterations: 20
4. Initializing cluster parameters...
Initial cluster centers (credit_score, interest_rate):
Cluster 1: (640.0, 20.28%)
Cluster 2: (659.0, 10.48%)
Cluster 3: (547.0, 15.08%)
6. Running EM Algorithm...
Iteration 1/20: Log-likelihood = -473242.99
Iteration 5/20: Log-likelihood = -416556.71
Iteration 10/20: Log-likelihood = -413901.43
Iteration 15/20: Log-likelihood = -413637.34
Iteration 20/20: Log-likelihood = -413436.60
7. Final cluster parameters:
Cluster 1:
Center: (Credit Score=597.8, Interest Rate=18.84%)
Variance: (σ²_x=2579.4, σ²_y=8.78)
Prior probability: 0.356 (35.6%)
Cluster 2:
Center: (Credit Score=692.0, Interest Rate=11.64%)
Variance: (σ²_x=2555.6, σ²_y=6.36)
Prior probability: 0.337 (33.7%)
Cluster 3:
Center: (Credit Score=643.6, Interest Rate=15.85%)
Variance: (σ²_x=2850.6, σ²_y=7.65)
Prior probability: 0.307 (30.7%)
======================================================================
CLUSTER ANALYSIS SUMMARY
======================================================================
Cluster 1 (17580 customers, 35.2%):
Credit Score: 585.2 ± 43.3
Interest Rate: 19.37% ± 2.60%
Loan Status (Approved): 33.0%
Product Type Distribution:
- Credit Card: 12533 (71.3%)
- Personal Loan: 4038 (23.0%)
- Line of Credit: 1009 (5.7%)
Cluster 2 (16925 customers, 33.9%):
Credit Score: 694.9 ± 48.1
Interest Rate: 11.15% ± 1.95%
Loan Status (Approved): 73.0%
Product Type Distribution:
- Personal Loan: 8408 (49.7%)
- Line of Credit: 6959 (41.1%)
- Credit Card: 1558 (9.2%)
Cluster 3 (15495 customers, 31.0%):
Credit Score: 653.9 ± 44.9
Interest Rate: 15.86% ± 1.94%
Loan Status (Approved): 60.4%
Product Type Distribution:
- Credit Card: 8364 (54.0%)
- Personal Loan: 5077 (32.8%)
- Line of Credit: 2054 (13.3%)
The code uses unsupervised machine learning to automatically discover 3 customer segments in loan data based on credit scores and interest rates. It's like finding natural groupings without being told what to look for.
The Algorithm (Expectation-Maximization):
- Starts with random guesses for 3 cluster centers
- E-Step: Calculates "how likely is each customer to belong to each cluster?"
- M-Step: Updates cluster parameters based on those probabilities
- Repeats until the clusters stabilize
What It Found:
- High-Risk Cluster (35%): Credit score ~585, interest rate ~19.4%, mostly credit cards, 33% approval
- Low-Risk Cluster (34%): Credit score ~695, interest rate ~11.2%, personal loans/LOCs, 73% approval
- Medium-Risk Cluster (31%): Credit score ~654, interest rate ~15.9%, mixed products, 60% approval
In [51]:
import matplotlib.pyplot as plt
import numpy as np
plt.figure(figsize=(8, 8))
# Create a 2D histogram
plt.hist2d(xf2, yf2, bins=50, cmap='viridis')
# Add a colorbar for reference
plt.colorbar(label='Count')
# Add labels and title
plt.xlabel('Interest rate')
plt.ylabel('Credit score')
plt.title('2D Histogram of interest rate and credit score')
# Display the plot
plt.show()
# this diagram shows a peak in one zone, around 600-700 score , this is the same information we saw on the 3D diagram before.
# There is still a secondary peak, lower, in the other cloud.
Experiment 3 - Cluster-Weighted Modeling (WORK IN PROGRESS..)¶
In [127]:
import matplotlib.pyplot as plt
import numpy as np
#
# state cluster-weighted modeling parameters
#
dim = 2
nstates = 2
nclusters = 5
nsamples = 100
momentum = 0.9
rate = 0.1
min_var = 0.1
niterations = 100
#
# load data: NOTE: in this experiment, I selected two other features: age and interest rate
#
feature1 = 'interest_rate'
feature1_lim = [0,25]
feature2 = 'age'
feature2_lim = [15,70]
df_ones = df[df['loan_status']== 1]
zero_states = np.stack((df_ones[feature1].to_numpy(), df_ones[feature2].to_numpy()), axis=0)
nzeros = zero_states.shape[1]
print('nzeros: ', nzeros)
print('Count of denied loans: ' ,len(zero_states[0]))
one_states = np.stack((df_zeroes[feature1].to_numpy(),df_zeroes[feature2].to_numpy()), axis=0)
print('Count of granted loans: ', len(one_states[0]))
nones = one_states.shape[1]
print('nones: ', nones)
print('X - min zero_states: ',min(zero_states[0]), ' max zero_states: ',max(zero_states[0]), ' - min one_states: ',min(one_states[0]),'max one_states: ',max(one_states[0]))
print('Y - min zero_states: ',min(zero_states[1]), ' max zero_states: ',max(zero_states[1]), ' - min one_states: ',min(one_states[1]),'max one_states: ',max(one_states[1]))
points = np.hstack((zero_states,one_states))
states = np.append(np.zeros(nzeros,dtype=int),np.ones(nones,dtype=int))
npts = len(states)
print('npts: ', npts)
nzeros: 27523 Count of denied loans: 27523 Count of granted loans: 22477 nones: 22477 X - min zero_states: 6.0 max zero_states: 23.0 - min one_states: 6.03 max one_states: 23.0 Y - min zero_states: 18.0 max zero_states: 70.0 - min one_states: 18.0 max one_states: 70.0 npts: 50000
In [128]:
#
# initialize arrays
#
indices = np.random.randint(0,npts,nclusters)
means = points[:,indices]
dmean = np.zeros((dim,nclusters))
covariances = np.zeros((nclusters,2,2))
covariances[:,0,0] = np.var(points[0,:])
covariances[:,1,1] = np.var(points[1,:])
pc = np.ones(nclusters)/nclusters # p(c)
pxgc = np.zeros((nclusters,npts)) # p(x|c)
psgxc = np.ones((nclusters,nstates))/nstates # p(s|xc)
#
# plot data
#
plt.plot(zero_states[0,:],zero_states[1,:],ls='',marker='.',markeredgecolor='green')
plt.plot(one_states[0,:],one_states[1,:],ls='',marker='.',markeredgecolor='orange')
plt.xlim(feature1_lim)
plt.ylim(feature2_lim)
#
# plot clusters
#
def plot_covar():
for cluster in range(means.shape[1]):
x = means[0,cluster]
y = means[1,cluster]
type = np.argmax(psgxc[cluster,:])
w,v = np.linalg.eig(covariances[cluster])
w = np.sqrt(w)
plt.plot([x-w[0]*v[0,0],x+w[0]*v[0,0]],[y-w[0]*v[1,0],y+w[0]*v[1,0]],'k')
plt.plot([x-w[1]*v[0,1],x+w[1]*v[0,1]],[y-w[1]*v[1,1],y+w[1]*v[1,1]],'k')
plt.plot([x],[y],'k',marker=f'${type}$',markersize=15)
plot_covar()
plt.title('covariance clusters before iteration')
plt.show()
#
# do E-M iteration
#
for i in range(niterations):
for cluster in range(nclusters):
mean = np.outer(means[:,cluster],np.ones(npts))
cinv = np.linalg.pinv(covariances[cluster])
pxgc[cluster,:] = (np.sqrt(np.linalg.det(cinv))/(2*np.pi)**(dim/2))\
*np.exp(-np.sum((points-mean)*(cinv@(points-mean)),0)/2)
pxc = pxgc*np.outer(pc,np.ones(npts))
pcgx = pxc/np.outer(np.ones(nclusters),np.sum(pxc,0))
psxc = psgxc[:,states]*pxc
pcgsx = psxc/np.outer(np.ones(nclusters),np.sum(psxc,0))
pc = np.sum(pcgsx,1)/npts
for cluster in range(nclusters):
newmean = momentum*dmean[:,cluster]\
+np.sum(points*np.outer(np.ones(dim),pcgsx[cluster,:]),1)\
/np.sum(np.outer(np.ones(dim),pcgsx[cluster,:]),1)
dmean[:,cluster] = newmean-means[:,cluster]
means[:,cluster] += rate*dmean[:,cluster]
m = np.outer(means[:,cluster],np.ones(npts))
for d in range(dim):
covariances[cluster][:,d] = np.sum(np.outer(\
np.ones(dim),points[d,:]-m[d,:])*(points-m)*\
np.outer(np.ones(dim),pcgsx[cluster,:]),1)\
/np.sum(np.outer(np.ones(dim),pcgsx[cluster,:]),1)\
+min_var
for state in range(nstates):
index = np.argwhere(states == state)
psgxc[:,state] = np.sum(pcgsx[:,index],1)[:,0]/np.sum(pcgsx[:,:],1)
#
# plot data
#
plt.plot(zero_states[0,:],zero_states[1,:],ls='',marker='$0$',markeredgecolor='green')
plt.plot(one_states[0,:],one_states[1,:],ls='',marker='$1$',markeredgecolor='orange')
plt.xlim(feature1_lim)
plt.ylim(feature2_lim)
#
# plot clusters
#
plot_covar()
#
# plot decision boundary
#
ngrid = 100
xmin = np.min(points[0,:])
xmax = np.max(points[0,:])
ymin = np.min(points[1,:])
ymax = np.max(points[1,:])
x = np.linspace(xmin,xmax,ngrid)
y = np.linspace(ymin,ymax,ngrid)
mx,my = np.meshgrid(x,y)
x = np.reshape(mx,(ngrid*ngrid))
y = np.reshape(my,(ngrid*ngrid))
plotpoints = np.vstack((x,y))
pxgc = np.zeros((nclusters,ngrid*ngrid))
for cluster in range(nclusters):
mean = np.outer(means[:,cluster],np.ones(ngrid*ngrid))
cinv = np.linalg.pinv(covariances[cluster])
pxgc[cluster,:] = (np.sqrt(np.linalg.det(cinv))/(2*np.pi)**(dim/2))\
*np.exp(-np.sum((plotpoints-mean)*(cinv@(plotpoints-mean)),0)/2)
pxc = pxgc*np.outer(pc,np.ones(ngrid*ngrid))
p = np.sum(np.outer(psgxc[:,0],np.ones(ngrid*ngrid))*pxc,0)/np.sum(pxc,0)
p = np.reshape(p,(ngrid,ngrid))
plt.contour(mx,my,p,[0.5])
plt.title('covariance clusters and decision boundaries after iteration')
plt.show()
#
# plot probability surface
#
fig, ax = plt.subplots(subplot_kw={"projection":"3d"})
ax.plot_wireframe(mx,my,p,rstride=10,cstride=10,color='gray')
plt.plot(zero_states[0,:],zero_states[1,:],zdir='z',ls='',marker='r$0$',markeredgecolor='green')
plt.plot(one_states[0,:],one_states[1,:],zdir='z',ls='',marker='$1$',markeredgecolor='orange')
plt.title('probability of state 0')
plt.show()
ok, I'm lost...
In [132]:
import matplotlib.pyplot as plt
import numpy as np
feature1 = 'interest_rate'
feature2 = 'age'
df_denied = df[df['loan_status']== 0]
plt.figure(figsize=(8, 8))
# Create a 2D histogram
plt.hist2d(df_denied[feature1], df_denied[feature2], bins=20, cmap='viridis')
# Add a colorbar for reference
plt.colorbar(label='Count - denied loans')
# Add labels and title
plt.xlabel('Interest rate')
plt.ylabel('Age')
plt.title('2D Histogram of interest rate and age for denied loans')
# Display the plot
feature1_lim = [5,25]
feature2_lim = [15,70]
plt.xlim(feature1_lim)
plt.ylim(feature2_lim)
plt.show()
In [133]:
import matplotlib.pyplot as plt
import numpy as np
feature1 = 'interest_rate'
feature2 = 'age'
df_granted = df[df['loan_status']== 1]
plt.figure(figsize=(8, 8))
# Create a 2D histogram
plt.hist2d(df_granted[feature1], df_granted[feature2], bins=20, cmap='viridis')
# Add a colorbar for reference
plt.colorbar(label='Count - granted loans')
# Add labels and title
plt.xlabel('Interest rate')
plt.ylabel('Age')
plt.title('2D Histogram of interest rate and age for granted loans')
# Display the plot
feature1_lim = [5,25]
feature2_lim = [15,70]
plt.xlim(feature1_lim)
plt.ylim(feature2_lim)
plt.show()
In [ ]: