In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# Load the data
df = pd.read_csv('datasets/data.csv')
# Remove the header row from the data if needed (the first row is actually a title)
# Let's check if there's a title row issue
print("First few rows of data:")
print(df.head())
# If there's an issue with the first row being a title, we can fix it
if df.columns[0] == 'percentage of alcohol consumption by dzongkhag wise':
# Read the CSV again with proper handling
df = pd.read_csv('datasets/data.csv', skiprows=1)
# Ensure the percentage column is numeric
df['percentage'] = pd.to_numeric(df['percentage'], errors='coerce')
# Get the percentage data for density estimation
data = df['percentage'].dropna().values
# Create a range of values for the density plot
x_range = np.linspace(min(data) - 5, max(data) + 5, 1000)
# Perform kernel density estimation
kde = stats.gaussian_kde(data)
# Calculate density values
density = kde(x_range)
# Create visualization
plt.figure(figsize=(12, 6))
# Plot 1: Histogram with KDE overlay
plt.subplot(1, 2, 1)
plt.hist(data, bins=10, density=True, alpha=0.5, color='skyblue', edgecolor='black', label='Histogram')
plt.plot(x_range, density, color='red', linewidth=2, label='KDE')
plt.xlabel('Alcohol Consumption Percentage')
plt.ylabel('Density')
plt.title('Density Estimation of Alcohol Consumption')
plt.legend()
plt.grid(True, alpha=0.3)
# Plot 2: Just the KDE curve
plt.subplot(1, 2, 2)
plt.plot(x_range, density, color='darkblue', linewidth=3)
plt.fill_between(x_range, density, alpha=0.3, color='blue')
plt.xlabel('Alcohol Consumption Percentage')
plt.ylabel('Density')
plt.title('Kernel Density Estimate (KDE)')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Print statistical summary
print("\n" + "="*50)
print("STATISTICAL SUMMARY")
print("="*50)
print(f"Number of observations: {len(data)}")
print(f"Mean: {np.mean(data):.2f}")
print(f"Standard deviation: {np.std(data):.2f}")
print(f"Minimum: {np.min(data):.2f}")
print(f"25th percentile: {np.percentile(data, 25):.2f}")
print(f"Median (50th percentile): {np.median(data):.2f}")
print(f"75th percentile: {np.percentile(data, 75):.2f}")
print(f"Maximum: {np.max(data):.2f}")
# Calculate and display bandwidth (smoothing parameter)
print(f"\nKDE Bandwidth: {kde.factor * np.std(data):.3f}")
# Find peaks in the density (modes)
from scipy.signal import find_peaks
peaks, _ = find_peaks(density)
if len(peaks) > 0:
print(f"\nDetected {len(peaks)} peak(s) in the density at:")
for i, peak in enumerate(peaks[:3]): # Show up to 3 main peaks
print(f" Peak {i+1}: ~{x_range[peak]:.1f}% consumption")
First few rows of data: Dzongkhag percentage 0 bumthang 21.8 1 chukha 30.7 2 dagana 31.3 3 gasa 23.8 4 haa 27.9
================================================== STATISTICAL SUMMARY ================================================== Number of observations: 20 Mean: 33.64 Standard deviation: 7.20 Minimum: 21.80 25th percentile: 29.68 Median (50th percentile): 33.55 75th percentile: 37.10 Maximum: 50.60 KDE Bandwidth: 3.954 Detected 1 peak(s) in the density at: Peak 1: ~33.0% consumption
In [ ]: