import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Load the loan-approval dataset from its CSV file.
df = pd.read_csv(
    "datasets/Loan_approval_data_2025.csv",
    sep=',',
    encoding='ascii',
)

# Split the column names into numeric and non-numeric groups by dtype.
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
categorical_cols = list(df.select_dtypes(exclude=[np.number]).columns)

# 🧾 Display dataset information: (number of rows, number of columns)
print("Dataset shape:", df.shape)

Dataset shape: (50000, 20)

# Preview the first five rows.
df.head(5)

# Histogram of the debt-to-income ratio over 30 bins, with a KDE overlay.
# According to the seaborn documentation, when the kde parameter is True it
# computes a kernel density estimate to smooth the distribution and shows it
# on the plot as one or more lines. Only relevant with univariate data.
ratio_column = 'debt_to_income_ratio'
plt.figure()
sns.histplot(df[ratio_column], bins=30, kde=True)
plt.title(ratio_column)
plt.tight_layout()
plt.show()

# Split debt_to_income_ratio into 30 equal-width buckets, then compute the row
# count and the mean ratio of each bucket.
#
# NOTE: pd.cut derives its bin edges from the min/max of the values, so the
# result does not depend on row order. The sorted copy below is therefore not
# used for the binning; it is kept only so that any code relying on
# `df_sorted` keeps working.
df_sorted = df.sort_values(by='debt_to_income_ratio')

# Bin the column of `df` directly: no reliance on index alignment between a
# sorted copy and the original frame.
df['bucket'] = pd.cut(df['debt_to_income_ratio'], bins=30)

# observed=False keeps every bucket in the result, including empty ones
# (which get a count of 0 and a NaN mean).
results = df.groupby('bucket', observed=False).agg(
    count=('debt_to_income_ratio', 'size'),  # 'size' counts all rows in the group, NaNs included
    average_value=('debt_to_income_ratio', 'mean'),  # mean ratio of the rows in the bucket
)
print(results.shape)
print(results.head(5))

(30, 2)
                  count  average_value
bucket                                
(0.0012, 0.0286]    575       0.018591
(0.0286, 0.0552]   1451       0.042909
(0.0552, 0.0818]   1958       0.069200
(0.0818, 0.108]    2545       0.095508
(0.108, 0.135]     2987       0.122321

# Basic chart of the bucketed data: one marker per bucket, with the bucket's
# mean ratio on the x-axis and its row count on the y-axis.
bucket_means = results['average_value']
bucket_counts = results['count']
plt.plot(bucket_means, bucket_counts, 'o')
plt.xlabel('Average value per category')
plt.ylabel('Count')
plt.show()

# Fit polynomials of order 1 to 4 to the (average value, count) pairs of the
# buckets, print their coefficients and overlay the fitted curves on the data.

# Buckets that contain no rows have a NaN average value, which would corrupt
# the least-squares fit, so they are dropped before fitting.
fit_data = results.dropna(subset=['average_value'])
x = fit_data['average_value']
y = fit_data['count']
xmin = x.min()
xmax = x.max()
npts = x.count()

# np.linspace includes xmax and always returns exactly npts samples. The
# previous float-step np.arange excluded the end point, so the fitted curves
# stopped one step short of the last data point.
xfit = np.linspace(xmin, xmax, npts)

# Least-squares polynomial fits (coefficients are ordered highest power first).
coeff1, coeff2, coeff3, coeff4 = (np.polyfit(x, y, order) for order in (1, 2, 3, 4))
pfit1, pfit2, pfit3, pfit4 = (
    np.poly1d(coeffs) for coeffs in (coeff1, coeff2, coeff3, coeff4)
)
# Evaluate each fitted polynomial on the common x grid.
yfit1, yfit2, yfit3, yfit4 = (pfit(xfit) for pfit in (pfit1, pfit2, pfit3, pfit4))

# (legend label, matplotlib format string, coefficients, fitted values)
fits = (
    ('first-order', 'g-', coeff1, yfit1),
    ('second-order', 'r-', coeff2, yfit2),
    ('third-order', 'y-', coeff3, yfit3),
    ('fourth-order', 'b-', coeff4, yfit4),
)

for fit_label, _, fit_coeffs, _ in fits:
    print(f"{fit_label} fit coefficients: {fit_coeffs}")

plt.plot(x, y, 'o')
for fit_label, fit_format, _, fit_values in fits:
    plt.plot(xfit, fit_values, fit_format, label=fit_label)
plt.legend()
plt.show()

first-order fit coefficients: [-3634.87677097  3125.16103026]
second-order fit coefficients: [-10284.81091016   4641.92738261   2004.76081151]
third-order fit coefficients: [ 55076.80835688 -76998.11717826  26319.61345257    504.83893844]
fourth-order fit coefficients: [-7.96009487e+04  1.84026230e+05 -1.44460272e+05  3.86583105e+04
 -2.46815666e+01]

	customer_id	age	occupation_status	years_employed	annual_income	credit_score	credit_history_years	savings_assets	current_debt	delinquencies_last_2yrs	product_type	loan_intent	loan_amount	interest_rate	debt_to_income_ratio	loan_to_income_ratio	payment_to_income_ratio	loan_status
0	CUST100000	40	Employed	17.2	25579	692	5.3	895	10820	0	Credit Card	Business	600	17.02	0.423	0.023	0.008	1
1	CUST100001	33	Employed	7.3	43087	627	3.5	169	16550	1	Personal Loan	Home Improvement	53300	14.10	0.384	1.237	0.412	0
2	CUST100002	42	Student	1.1	20840	689	8.4	17	7852	0	Credit Card	Debt Consolidation	2100	18.33	0.377	0.101	0.034	1
3	CUST100003	53	Student	0.5	29147	692	9.8	1480	11603	1	Credit Card	Business	2900	18.74	0.398	0.099	0.033	1
4	CUST100004	32	Employed	12.5	63657	630	7.2	209	12424	0	Personal Loan	Education	99600	13.92	0.195	1.565	0.522	1

Week 3: fitting - "Loan approval" dataset¶

Context¶

Load dataset¶

Explore content¶

We want to achieve something similar to the curve generated by the KDE parameter in this histogram.¶

To start, we need similar data (i.e., 30 buckets, with each bucket's average value on the x-axis and its count on the y-axis)¶

Then we try to fit a function... and it makes sense after the fourth attempt!¶