The graphs show the probability distribution of each numeric column in the StudentsPerformance dataset.
For every numeric variable, the code creates a histogram and a fitted normal distribution curve.
In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm
# Configuration: values a reader may want to tune, kept in one place.
DATA_PATH = "datasets/StudentsPerformance.csv"
N_BINS = 10            # number of histogram bins per column
N_CURVE_POINTS = 100   # number of x samples used to draw the fitted PDF

# load data
df = pd.read_csv(DATA_PATH)

# get only numeric columns
num_cols = df.select_dtypes(include="number").columns

# One figure per numeric column: density-normalised histogram plus the
# normal PDF whose parameters were fitted to that column.
for col in num_cols:
    data = df[col].dropna()

    # A column with no non-missing values has nothing to fit or plot.
    if data.empty:
        print(f"Skipping {col!r}: no non-missing values")
        continue

    # fit normal distribution (estimates of the mean and standard deviation)
    mu, sigma = norm.fit(data)

    # plot on an explicit Axes instead of the implicit pyplot state
    fig, ax = plt.subplots()
    # density=True puts the histogram on the same scale as the PDF.
    ax.hist(data, bins=N_BINS, density=True)

    # A constant column yields sigma == 0, for which norm.pdf is not defined
    # (it evaluates to NaN); in that case only the histogram is drawn.
    if sigma > 0:
        x = np.linspace(data.min(), data.max(), N_CURVE_POINTS)
        ax.plot(x, norm.pdf(x, mu, sigma))

    ax.set_xlabel(col)
    ax.set_ylabel("Probability Density")
    ax.set_title(f"Probability Distribution Fit: {col}")
    plt.show()
    # Release the figure so looping over many columns does not accumulate them.
    plt.close(fig)
In [ ]: