import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Load the data
df = pd.read_csv('datasets/data.csv')

# Remove the header row from the data if needed (the first row is actually a title)
# Let's check if there's a title row issue
print("First few rows of data:")
print(df.head())

# If there's an issue with the first row being a title, we can fix it
if df.columns[0] == 'percentage of alcohol consumption by dzongkhag wise':
    # Read the CSV again with proper handling
    df = pd.read_csv('datasets/data.csv', skiprows=1)

# Ensure the percentage column is numeric
df['percentage'] = pd.to_numeric(df['percentage'], errors='coerce')

# Get the percentage data for density estimation
data = df['percentage'].dropna().values

# Create a range of values for the density plot
x_range = np.linspace(min(data) - 5, max(data) + 5, 1000)

# Perform kernel density estimation
kde = stats.gaussian_kde(data)

# Calculate density values
density = kde(x_range)

# Create visualization
plt.figure(figsize=(12, 6))

# Plot 1: Histogram with KDE overlay
plt.subplot(1, 2, 1)
plt.hist(data, bins=10, density=True, alpha=0.5, color='skyblue', edgecolor='black', label='Histogram')
plt.plot(x_range, density, color='red', linewidth=2, label='KDE')
plt.xlabel('Alcohol Consumption Percentage')
plt.ylabel('Density')
plt.title('Density Estimation of Alcohol Consumption')
plt.legend()
plt.grid(True, alpha=0.3)

# Plot 2: Just the KDE curve
plt.subplot(1, 2, 2)
plt.plot(x_range, density, color='darkblue', linewidth=3)
plt.fill_between(x_range, density, alpha=0.3, color='blue')
plt.xlabel('Alcohol Consumption Percentage')
plt.ylabel('Density')
plt.title('Kernel Density Estimate (KDE)')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print statistical summary
print("\n" + "="*50)
print("STATISTICAL SUMMARY")
print("="*50)
print(f"Number of observations: {len(data)}")
print(f"Mean: {np.mean(data):.2f}")
print(f"Standard deviation: {np.std(data):.2f}")
print(f"Minimum: {np.min(data):.2f}")
print(f"25th percentile: {np.percentile(data, 25):.2f}")
print(f"Median (50th percentile): {np.median(data):.2f}")
print(f"75th percentile: {np.percentile(data, 75):.2f}")
print(f"Maximum: {np.max(data):.2f}")

# Calculate and display bandwidth (smoothing parameter)
print(f"\nKDE Bandwidth: {kde.factor * np.std(data):.3f}")

# Find peaks in the density (modes)
from scipy.signal import find_peaks
peaks, _ = find_peaks(density)
if len(peaks) > 0:
    print(f"\nDetected {len(peaks)} peak(s) in the density at:")
    for i, peak in enumerate(peaks[:3]):  # Show up to 3 main peaks
        print(f"  Peak {i+1}: ~{x_range[peak]:.1f}% consumption")

First few rows of data:
  Dzongkhag  percentage
0  bumthang        21.8
1    chukha        30.7
2    dagana        31.3
3      gasa        23.8
4       haa        27.9

==================================================
STATISTICAL SUMMARY
==================================================
Number of observations: 20
Mean: 33.64
Standard deviation: 7.20
Minimum: 21.80
25th percentile: 29.68
Median (50th percentile): 33.55
75th percentile: 37.10
Maximum: 50.60

KDE Bandwidth: 3.954

Detected 1 peak(s) in the density at:
  Peak 1: ~33.0% consumption