import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Set style for better visualization
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Read the CSV data
data = pd.read_csv('datasets/data.csv')

# Display basic information
print("Dataset Information:")
print(data.info())
print("\nFirst 5 rows:")
print(data.head())
print("\nDescriptive Statistics:")
print(data['percentage'].describe())

# Create figure with multiple subplots
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Probability Distribution Analysis of Alcohol Consumption by Dzongkhag', 
             fontsize=16, fontweight='bold', y=1.02)

# 1. Histogram with KDE
ax1 = axes[0, 0]
sns.histplot(data=data, x='percentage', kde=True, bins=10, ax=ax1, 
             color='steelblue', edgecolor='black')
ax1.set_title('Histogram with KDE', fontweight='bold')
ax1.set_xlabel('Percentage (%)')
ax1.set_ylabel('Frequency')
ax1.axvline(data['percentage'].mean(), color='red', linestyle='--', 
            label=f'Mean: {data["percentage"].mean():.1f}%')
ax1.axvline(data['percentage'].median(), color='green', linestyle='--', 
            label=f'Median: {data["percentage"].median():.1f}%')
ax1.legend()

# 2. Box plot
ax2 = axes[0, 1]
sns.boxplot(data=data, y='percentage', ax=ax2, color='lightcoral')
ax2.set_title('Box Plot', fontweight='bold')
ax2.set_ylabel('Percentage (%)')

# Add statistical annotations
stats_text = f"Q1: {data['percentage'].quantile(0.25):.1f}%\n"
stats_text += f"Q3: {data['percentage'].quantile(0.75):.1f}%\n"
stats_text += f"IQR: {data['percentage'].quantile(0.75) - data['percentage'].quantile(0.25):.1f}%\n"
stats_text += f"Range: {data['percentage'].max() - data['percentage'].min():.1f}%"
ax2.text(0.02, 0.98, stats_text, transform=ax2.transAxes, 
         verticalalignment='top', fontsize=9, 
         bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

# 3. Violin plot
ax3 = axes[0, 2]
sns.violinplot(data=data, y='percentage', ax=ax3, color='lightgreen', inner='quartile')
ax3.set_title('Violin Plot', fontweight='bold')
ax3.set_ylabel('Percentage (%)')

# 4. ECDF (Empirical Cumulative Distribution Function)
ax4 = axes[1, 0]
sorted_data = np.sort(data['percentage'])
ecdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
ax4.plot(sorted_data, ecdf, marker='.', linestyle='none', color='purple')
ax4.set_title('Empirical CDF', fontweight='bold')
ax4.set_xlabel('Percentage (%)')
ax4.set_ylabel('Cumulative Probability')
ax4.grid(True, alpha=0.3)

# Add percentiles to ECDF
percentiles = [25, 50, 75, 90]
for p in percentiles:
    percentile_value = np.percentile(data['percentage'], p)
    ax4.axvline(percentile_value, color='orange', alpha=0.5, linestyle='--', linewidth=0.8)
    ax4.text(percentile_value + 0.5, 0.1, f'{p}%', fontsize=8)

# 5. Q-Q Plot for normality check
ax5 = axes[1, 1]
stats.probplot(data['percentage'], dist="norm", plot=ax5)
ax5.set_title('Q-Q Plot (Normality Check)', fontweight='bold')
ax5.get_lines()[0].set_marker('.')
ax5.get_lines()[0].set_markeredgecolor('blue')
ax5.get_lines()[0].set_markerfacecolor('blue')
ax5.get_lines()[1].set_color('red')

# Calculate and display skewness and kurtosis
skewness = data['percentage'].skew()
kurtosis = data['percentage'].kurtosis()
ax5.text(0.05, 0.95, f"Skewness: {skewness:.3f}\nKurtosis: {kurtosis:.3f}", 
         transform=ax5.transAxes, verticalalignment='top',
         bbox=dict(boxstyle='round', facecolor='yellow', alpha=0.5))

# 6. Probability Density Function (PDF) using KDE
ax6 = axes[1, 2]
kde = stats.gaussian_kde(data['percentage'])
x_range = np.linspace(data['percentage'].min() - 5, data['percentage'].max() + 5, 1000)
pdf = kde.evaluate(x_range)
ax6.plot(x_range, pdf, 'b-', linewidth=2, label='KDE')
ax6.fill_between(x_range, pdf, alpha=0.3, color='blue')
ax6.set_title('Probability Density Function (KDE)', fontweight='bold')
ax6.set_xlabel('Percentage (%)')
ax6.set_ylabel('Density')

# Add summary statistics to the last plot
summary_text = f"Mean: {data['percentage'].mean():.2f}%\n"
summary_text += f"Std Dev: {data['percentage'].std():.2f}%\n"
summary_text += f"Min: {data['percentage'].min():.1f}%\n"
summary_text += f"Max: {data['percentage'].max():.1f}%\n"
summary_text += f"Mode: {data['percentage'].mode().values[0]:.1f}%"
ax6.text(0.02, 0.98, summary_text, transform=ax6.transAxes, 
         verticalalignment='top', fontsize=9,
         bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.5))

# Add a bar chart showing all dzongkhags (extra figure)
plt.figure(figsize=(14, 8))
bars = plt.barh(data['Dzongkhag'], data['percentage'], 
                color=plt.cm.viridis(data['percentage']/data['percentage'].max()))
plt.xlabel('Percentage (%)')
plt.title('Alcohol Consumption Percentage by Dzongkhag', fontweight='bold', fontsize=14)
plt.grid(axis='x', alpha=0.3)

# Add value labels to bars
for bar, value in zip(bars, data['percentage']):
    plt.text(value + 0.5, bar.get_y() + bar.get_height()/2, 
             f'{value}%', va='center', fontsize=9)

# Highlight extremes
max_idx = data['percentage'].idxmax()
min_idx = data['percentage'].idxmin()
bars[max_idx].set_color('red')
bars[min_idx].set_color('green')

plt.tight_layout()
plt.show()

# Print statistical summary
print("\n" + "="*60)
print("PROBABILITY DISTRIBUTION ANALYSIS SUMMARY")
print("="*60)
print(f"Mean: {data['percentage'].mean():.2f}%")
print(f"Median: {data['percentage'].median():.2f}%")
print(f"Mode: {data['percentage'].mode().values[0]:.2f}%")
print(f"Standard Deviation: {data['percentage'].std():.2f}%")
print(f"Variance: {data['percentage'].var():.2f}")
print(f"Range: {data['percentage'].max() - data['percentage'].min():.2f}%")
print(f"25th Percentile: {data['percentage'].quantile(0.25):.2f}%")
print(f"75th Percentile: {data['percentage'].quantile(0.75):.2f}%")
print(f"IQR: {data['percentage'].quantile(0.75) - data['percentage'].quantile(0.25):.2f}%")
print(f"Skewness: {skewness:.3f}")
print(f"Kurtosis: {kurtosis:.3f}")
print("\nExtreme Values:")
print(f"Highest: {data.loc[max_idx, 'Dzongkhag']} - {data.loc[max_idx, 'percentage']}%")
print(f"Lowest: {data.loc[min_idx, 'Dzongkhag']} - {data.loc[min_idx, 'percentage']}%")

# Check for normality using Shapiro-Wilk test
if len(data) < 5000:  # Shapiro-Wilk is only reliable for n < 5000
    stat, p_value = stats.shapiro(data['percentage'])
    print(f"\nNormality Test (Shapiro-Wilk):")
    print(f"  Test Statistic: {stat:.4f}")
    print(f"  p-value: {p_value:.4f}")
    if p_value > 0.05:
        print("  The data appears to be normally distributed (p > 0.05)")
    else:
        print("  The data does not appear to be normally distributed (p ≤ 0.05)")

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Dzongkhag   20 non-null     object 
 1   percentage  20 non-null     float64
dtypes: float64(1), object(1)
memory usage: 452.0+ bytes
None

First 5 rows:
  Dzongkhag  percentage
0  bumthang        21.8
1    chukha        30.7
2    dagana        31.3
3      gasa        23.8
4       haa        27.9

Descriptive Statistics:
count    20.000000
mean     33.645000
std       7.384798
min      21.800000
25%      29.675000
50%      33.550000
75%      37.100000
max      50.600000
Name: percentage, dtype: float64

============================================================
PROBABILITY DISTRIBUTION ANALYSIS SUMMARY
============================================================
Mean: 33.64%
Median: 33.55%
Mode: 21.80%
Standard Deviation: 7.38%
Variance: 54.54
Range: 28.80%
25th Percentile: 29.68%
75th Percentile: 37.10%
IQR: 7.43%
Skewness: 0.433
Kurtosis: 0.346

Extreme Values:
Highest: lhuentse - 50.6%
Lowest: bumthang - 21.8%

Normality Test (Shapiro-Wilk):
  Test Statistic: 0.9667
  p-value: 0.6838
  The data appears to be normally distributed (p > 0.05)