Fitting a Probability Distribution to my data¶
In [5]:
# Load the dataset into `df`, which the plotting cells below read from.
# pandas is imported here so this cell does not depend on an import that is
# only executed in a later cell.
import pandas as pd

df = pd.read_csv("datasets/enhanced_student_habits_performance_dataset.csv")
In [9]:
import matplotlib.pyplot as plt
import seaborn as sns
# Plot a histogram with a KDE overlay for every numeric column of `df`.
numeric_cols = df.select_dtypes(include='number').columns
print("Numeric Columns Being Analyzed:")
print(numeric_cols)
# NOTE(review): the recorded output lists 'student_id' among these columns; it
# looks like an identifier rather than a measurement — confirm it should be plotted.

for col in numeric_cols:
    # One figure per column, drawn on an explicit Axes instead of pyplot's
    # implicit "current figure".
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(df[col], kde=True, bins=30, ax=ax)
    ax.set_title(f"Probability Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    plt.show()
    # Release the figure once it has been shown so figures do not accumulate
    # in memory across loop iterations.
    plt.close(fig)
Numeric Columns Being Analyzed:
Index(['student_id', 'age', 'study_hours_per_day', 'social_media_hours',
'netflix_hours', 'attendance_percentage', 'sleep_hours',
'exercise_frequency', 'mental_health_rating', 'previous_gpa',
'semester', 'stress_level', 'social_activity', 'screen_time',
'parental_support_level', 'motivation_level', 'exam_anxiety_score',
'time_management_score', 'exam_score'],
dtype='object')
In [8]:
import matplotlib.pyplot as plt
import seaborn as sns
# Histogram with a KDE overlay for a single column of `df`.
# Pick the factor to inspect (example: 'study_hours_per_day').
column = 'study_hours_per_day'

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df[column], kde=True, bins=30, ax=ax)
ax.set(
    title=f"Probability Distribution of {column}",
    xlabel=column,
    ylabel="Frequency",
)
plt.show()
Density Estimation¶
In [11]:
import matplotlib.pyplot as plt
import seaborn as sns
# Kernel density estimate for a single column of `df`.
# Pick the factor to inspect.
column = 'exam_score'  # <-- change this to any numeric column

fig, ax = plt.subplots(figsize=(10, 5))
sns.kdeplot(df[column], fill=True, linewidth=2, ax=ax)
ax.set_title(f"Density Estimation (KDE) for {column}")
ax.set_xlabel(column)
ax.set_ylabel("Density")
plt.show()
Kernel density estimation (KDE) shows how your data is distributed by drawing a smooth curve estimated from the data itself, much like a smoothed version of the histogram. Instead of assuming a fixed shape (like a normal distribution), KDE lets the data determine its own shape.
In [12]:
# Histogram with a KDE overlay. This cell defines neither `df` nor `column`;
# it uses whatever values previously executed cells left in the kernel.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df[column], kde=True, bins=30, ax=ax)
ax.set(
    title=f"Histogram + KDE for {column}",
    xlabel=column,
    ylabel="Frequency / Density",
)
plt.show()
In [5]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import numpy as np
# Step 1: read the dataset from disk.
df = pd.read_csv("datasets/enhanced_student_habits_performance_dataset.csv")

# Step 2: pick the column to estimate (any numeric column) and drop its
# missing values before fitting.
column = 'study_hours_per_day'
data = df[column].dropna()

# Step 3: fit a Gaussian KDE and evaluate it at 500 evenly spaced points
# spanning the observed minimum and maximum of the column.
kde = gaussian_kde(data)
x_range = np.linspace(data.min(), data.max(), 500)
density = kde(x_range)

# Step 4: draw the density curve with a light fill underneath it.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(x_range, density, linewidth=3, color='lightcoral')
ax.fill_between(x_range, density, alpha=0.3, color='mistyrose')
ax.set_title(f"KDE Density Estimation: {column}", fontsize=14)
ax.set_xlabel(column)
ax.set_ylabel("Density")
ax.grid(alpha=0.2)
plt.show()
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Read the dataset from disk.
df = pd.read_csv("datasets/enhanced_student_habits_performance_dataset.csv")

# Column to plot, with its missing values removed.
column = "study_hours_per_day"
data = df[column].dropna()

# Histogram (30 bins, outlined bars) with a KDE overlay.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data, kde=True, bins=30, edgecolor="black", alpha=0.6, ax=ax)
ax.set(
    title=f"Histogram + KDE for {column}",
    xlabel=column,
    ylabel="Frequency / Density",
)
fig.tight_layout()
plt.show()
In [ ]: