import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Load the loan-approval dataset from the local CSV file.
df = pd.read_csv("datasets/Loan_approval_data_2025.csv", delimiter=',', encoding='ascii')

# Split the columns by dtype: numeric columns are imputed with their median,
# every other column with its most frequent value.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

# Assign the filled column back rather than calling fillna(..., inplace=True)
# on df[col]: the in-place form operates on an intermediate object, which
# pandas warns about and which leaves df untouched under Copy-on-Write.
for col in numeric_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

for col in categorical_cols:
    if df[col].isnull().any():
        # mode() can return several values; the first one is used.
        df[col] = df[col].fillna(df[col].mode()[0])

print("Dataset shape:", df.shape)

Dataset shape: (50000, 20)

# Preview the first rows of the DataFrame (head() returns the first 5 by default).
df.head()

# Summary table: a header row, then one row per numeric column holding
# the column name, its mean and its standard deviation.
data = [('Column name','Mean','Standard deviation')]
data.extend([name, df[name].mean(), df[name].std()] for name in numeric_cols)

# Each table column is as wide as its longest cell once rendered as text.
col_widths = [max(len(str(cell)) for cell in column) for column in zip(*data)]

# Print every row left-aligned, with two spaces between columns.
for row in data:
    cells = (str(cell).ljust(width) for cell, width in zip(row, col_widths))
    print("  ".join(cells))

Column name              Mean                 Standard deviation 
age                      34.95706             11.118602817934459 
years_employed           7.454868             7.612096740249689  
annual_income            50062.89204          32630.501014124966 
credit_score             643.61482            64.73151828712788  
credit_history_years     8.168274             7.207552305542376  
savings_assets           3595.6194            13232.399397651972 
current_debt             14290.44222          13243.757492939529 
defaults_on_file         0.05348              0.22499089318908017
delinquencies_last_2yrs  0.55464              0.8450495562833942 
derogatory_marks         0.14764              0.4129961763947325 
loan_amount              33041.874            26116.185101786836 
interest_rate            15.4985908           4.06794197023421   
debt_to_income_ratio     0.28572416           0.1597865231706192 
loan_to_income_ratio     0.7019986600000001   0.4657875213640885 
payment_to_income_ratio  0.23399493999999998  0.15526809690994003
loan_status              0.55046              0.4974522465270163

# Show how the features are distributed: one histogram with a KDE overlay
# per numeric column, laid out on a grid that is four plots wide.
# The number of rows is derived from the number of columns so every column
# gets a slot (a fixed 4x4 grid fails as soon as there are more than 16).
n_grid_cols = 4
n_grid_rows = max(1, int(np.ceil(len(numeric_cols) / n_grid_cols)))
# Two inches of height per row, never smaller than the original 12x8 figure.
plt.figure(figsize=(12, max(8, 2 * n_grid_rows)))
for idx, col in enumerate(numeric_cols):
    plt.subplot(n_grid_rows, n_grid_cols, idx+1)
    sns.histplot(df[col], kde=True, bins=30)
    plt.title(col)
plt.tight_layout()
plt.show()

from scipy.stats import norm 
# Histogram of the credit scores, normalised (density=True) so that it is
# on the same scale as the probability density curves drawn on top of it.
plt.hist(df['credit_score'], bins=30, density=True, alpha=0.6, color='b')

# Fit a normal distribution to the data and get mean and standard deviation
mu, std = norm.fit(df['credit_score'])
print('mu: ', mu, ' std: ', std)

# First curve (red): PDF of the fitted normal over the current x-axis range.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)
plt.plot(x, p, 'r', linewidth=2)

# Second curve (green): PDF built from the pandas mean and standard deviation.
mu2 = df['credit_score'].mean()
std2 = df['credit_score'].std()
p2 = norm.pdf(x, mu2, std2)
# Plot p2, not p: drawing p a second time only repaints the fitted curve,
# so the comparison announced by the title would never actually be made.
plt.plot(x, p2, 'g', linewidth=2)

title = "You should not see any red line here"
plt.title(title)

plt.show()

mu:  643.61482  std:  64.73087096870859

# A heatmap draws a matrix as a grid of coloured cells, one cell per pair of
# variables (one on the x-axis, one on the y-axis).
# Applied to a correlation matrix, the colour of each cell encodes the
# correlation coefficient of the two variables, which makes strong positive
# or negative correlations, and unrelated variables, easy to spot.

numeric_df = df.select_dtypes(include=[np.number])
# Only draw the heatmap when there are at least four numeric columns.
if numeric_df.shape[1] >= 4:
    corr = numeric_df.corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Heatmap')
    plt.show()

# Covariance tells in which direction two variables move together.
# Correlation is covariance rescaled to the range -1..+1, so it also tells
# how strong the linear relationship is.
# Covariance is unbounded and depends on the units of the variables;
# correlation is dimensionless and unaffected by scale.

# 2x2 covariance matrix of age and credit score
cov_matrix = np.cov(df['age'], df['credit_score'])
print("Covariance Matrix:", cov_matrix, sep="\n")

# The off-diagonal entry is the covariance between the two features
covariance_xy = cov_matrix[0][1]
print(f"\nCovariance between x and y: {covariance_xy}")

# Reading the matrix:
#   upper-left cell  = variance of age (how much age spreads out)
#   lower-right cell = variance of credit score
#   other two cells  = covariance between age and credit score
# A positive covariance means the two variables tend to rise and fall
# together, a negative one means they move in opposite directions, and a
# value near zero suggests little linear relationship.
# Author's interpretation: a positive value seems plausible in the real
# world, since credit scores tend to be lower for young people
# (and possibly for very old people as well).

Covariance Matrix:
[[ 123.62332862  265.77305583]
 [ 265.77305583 4190.16945976]]

Covariance between x and y: 265.7730558319165

# Try to identify a trend: least-squares straight line of credit score against age.
x = df['age']
y = df['credit_score']
xmin = x.min()
xmax = x.max()
npts = x.count()
coeff1 = np.polyfit(x,y,1) # fit first-order polynomial, highest power first
pfit1 = np.poly1d(coeff1)
# Sample the fitted line with linspace instead of arange: arange with a
# fractional step stops short of xmax, can produce one sample more or less
# because of rounding, and fails when xmin == xmax (step of zero).
xfit = np.linspace(xmin,xmax,npts)
yfit1 = pfit1(xfit) # evaluate first-order fit
print(f"first-order fit coefficients: {coeff1}")
plt.plot(x,y,'o')
plt.plot(xfit,yfit1,'r-',label='Trend - first-order')
plt.legend()
plt.show()

first-order fit coefficients: [  2.14986167 568.46197658]

import numpy as np
import matplotlib.pyplot as plt
np.random.seed(10)
np.set_printoptions(precision=2)
np.set_printoptions(suppress=True)

# The two features whose joint distribution is examined.
feature1 = 'age'
feature2 = 'credit_score'
#
# load data
#
data = df[[feature1, feature2]]
#
# find mean, covariance, eigenvalues, and eigenvectors
#
covarmean = np.mean(data,axis=0)
print('covarmean: ', covarmean)
covar = np.cov(data,rowvar=False)
# The eigenvectors give the directions in which the distribution points,
# the eigenvalues the variance along each of those directions.
evalu,evect = np.linalg.eig(covar)
# np.linalg.eig returns the eigenvectors as the COLUMNS of evect, so
# evect[:, i] belongs to evalu[i]. Scale each eigenvector by the standard
# deviation along it (sqrt of its own eigenvalue) and use the same vector for
# the x and the y offsets. Mixing rows, columns and eigenvalues only gives
# the right picture when eig happens to return a pure rotation.
half_axis0 = evect[:,0]*np.sqrt(evalu[0])
half_axis1 = evect[:,1]*np.sqrt(evalu[1])
mean_x = covarmean.iloc[0]
mean_y = covarmean.iloc[1]
# Two segments through the mean, one per principal axis; the None entry
# breaks the line so the two segments are not joined.
covarplotx = [mean_x-half_axis0[0],mean_x+half_axis0[0],None,mean_x-half_axis1[0],mean_x+half_axis1[0]]
print('covarplotx: ', covarplotx)
covarploty = [mean_y-half_axis0[1],mean_y+half_axis0[1],None,mean_y-half_axis1[1],mean_y+half_axis1[1]]
print('covarploty: ', covarploty)
#
# plot and print
#
print("covariance matrix:")
print(covar)
plt.figure()
# 2-D histogram as background, raw points on top, mean as a red dot and the
# principal axes as red segments.
plt.hist2d(data[feature1],data[feature2],bins=30,cmap='viridis')
plt.plot(data[feature1],data[feature2],'o',markersize=1.5,alpha=0.3)
plt.plot(mean_x,mean_y,'ro')
plt.plot(covarplotx,covarploty,'r')
plt.show()

covarmean:  age              34.95706
credit_score    643.61482
dtype: float64
covarplotx:  [np.float64(45.2467933563304), np.float64(24.667326643669597), None, np.float64(30.74461224618814), np.float64(39.16950775381186)]
covarploty:  [np.float64(642.9451727421691), np.float64(644.2844672578309), None, np.float64(578.8867655544439), np.float64(708.3428744455562)]
covariance matrix:
[[ 123.62  265.77]
 [ 265.77 4190.17]]

# Interest rate against credit score, all products together.
x, y = df['credit_score'], df['interest_rate']
plt.ylim(0, 25)
plt.plot(x, y, 'o')
plt.show()

# The scatter shows two separate clouds - why?

# The dataset mixes several financial products; check whether that explains it.
print(df['product_type'].unique())

# Only 'Credit Card' matches a real product type. The 'NO_' prefix makes the
# other two entries match nothing, which switches those products off.
categories_to_keep = ['NO_Personal Loan', 'Credit Card', 'NO_Line of Credit']
keep_mask = df['product_type'].isin(categories_to_keep)
filtered_df_multiple = df.loc[keep_mask]

x1, y1 = filtered_df_multiple['credit_score'], filtered_df_multiple['interest_rate']

plt.ylim(0, 25)
plt.plot(x1, y1, 'o')
plt.show()

# So yes: credit cards are expensive!

['Credit Card' 'Personal Loan' 'Line of Credit']

# Complementary view: keep personal loans and lines of credit. The 'NO_'
# prefix makes the credit-card entry match nothing, so that product is dropped.
categories_to_keep = ['Personal Loan', 'NO_Credit Card', 'Line of Credit']
keep_mask = df['product_type'].isin(categories_to_keep)
filtered_df_multiple = df.loc[keep_mask]

x1, y1 = filtered_df_multiple['credit_score'], filtered_df_multiple['interest_rate']

plt.ylim(0, 25)
plt.plot(x1, y1, 'o')
plt.show()

import numpy as np
import matplotlib.pyplot as plt
np.random.seed(10)
# Keep four decimals: the covariance of features rescaled to [0, 1] is small,
# and with precision=1 every entry of that matrix was printed as "0.".
np.set_printoptions(precision=4, suppress=True)
# Number of samples and number of histogram bins used for the entropy estimates.
npts = df['age'].count()
nbins = 256
print(f"{nbins} bins\n")
#
def entropy(dist):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Args:
        dist: array-like of probabilities, of any dimensionality.

    Returns:
        -sum(p * log2(p)) over the strictly positive entries; the other
        entries are skipped, following the convention 0 log(0) = 0.
    """
    dist = np.asarray(dist)
    positives = dist[dist > 0]
    return -np.sum(positives*np.log2(positives))


def entropy2(dist):
    """Return the Shannon entropy, in bits, of a joint (2-D) distribution.

    The masking used by entropy() works for any number of dimensions, so this
    simply delegates to it; the name is kept for existing callers.
    """
    return entropy(dist)


def information(x, y, bins=None):
    """Estimate the mutual information, in bits, between two samples.

    Both samples are histogrammed separately and jointly, and the estimate is
    H(x) + H(y) - H(x, y) computed from the normalised histograms.

    Args:
        x: first sample, 1-D array-like.
        y: second sample, 1-D array-like of the same length.
        bins: number of histogram bins per axis. When omitted, the
            module-level `nbins` is used.

    Returns:
        The mutual information estimate as a float.
    """
    if bins is None:
        bins = nbins
    xhist, _ = np.histogram(x, bins)
    xdist = xhist/np.sum(xhist)
    yhist, _ = np.histogram(y, bins)
    ydist = yhist/np.sum(yhist)
    xyhist, _, _ = np.histogram2d(x, y, [bins, bins])
    xydist = xyhist/np.sum(xyhist)
    Hx = entropy(xdist)
    Hy = entropy(ydist)
    Hxy = entropy2(xydist)
    return Hx+Hy-Hxy
#
# Rescale both features to the [0, 1] interval (min-max normalisation)
#
def _min_max_scale(series):
    """Map a series linearly so its minimum becomes 0 and its maximum 1."""
    return (series - series.min()) / (series.max() - series.min())

xuniform = _min_max_scale(df['age'])
yuniform = _min_max_scale(df['credit_score'])
#
# Main
#
# Covariance of the rescaled pair, one column per feature.
covar = np.cov(np.c_[xuniform,yuniform],rowvar=False)
print(f"{npts:.0e} points")
print(f"uniform covariance:\n{covar}")
# Mutual information between the two rescaled features, shown in the title.
I = information(xuniform,yuniform)
plt.plot(xuniform,yuniform,'o')
plt.title(f"uniform mutual information: {I:.1f} bits")
plt.show()

256 bins

5e+04 points
uniform covariance:
[[0. 0.]
 [0. 0.]]

	customer_id	age	occupation_status	years_employed	annual_income	credit_score	credit_history_years	savings_assets	current_debt	delinquencies_last_2yrs	product_type	loan_intent	loan_amount	interest_rate	debt_to_income_ratio	loan_to_income_ratio	payment_to_income_ratio	loan_status
0	CUST100000	40	Employed	17.2	25579	692	5.3	895	10820	0	Credit Card	Business	600	17.02	0.423	0.023	0.008	1
1	CUST100001	33	Employed	7.3	43087	627	3.5	169	16550	1	Personal Loan	Home Improvement	53300	14.10	0.384	1.237	0.412	0
2	CUST100002	42	Student	1.1	20840	689	8.4	17	7852	0	Credit Card	Debt Consolidation	2100	18.33	0.377	0.101	0.034	1
3	CUST100003	53	Student	0.5	29147	692	9.8	1480	11603	1	Credit Card	Business	2900	18.74	0.398	0.099	0.033	1
4	CUST100004	32	Employed	12.5	63657	630	7.2	209	12424	0	Personal Loan	Education	99600	13.92	0.195	1.565	0.522	1

Week 5: probability - "Loan approval" dataset

Context

Load dataset

Explore content

Experiment 1 - statistics

Experiment 2 - distributions and modeling

Experiment 3 - Multidimensional distributions

Experiment 4 - I don't know what to call this...

Experiment 5 - Entropy