In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Set style for better visualization
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# Read the CSV data
data = pd.read_csv('datasets/data.csv')
# Display basic information
print("Dataset Information:")
print(data.info())
print("\nFirst 5 rows:")
print(data.head())
print("\nDescriptive Statistics:")
print(data['percentage'].describe())
# Create figure with multiple subplots
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Probability Distribution Analysis of Alcohol Consumption by Dzongkhag',
fontsize=16, fontweight='bold', y=1.02)
# 1. Histogram with KDE
ax1 = axes[0, 0]
sns.histplot(data=data, x='percentage', kde=True, bins=10, ax=ax1,
color='steelblue', edgecolor='black')
ax1.set_title('Histogram with KDE', fontweight='bold')
ax1.set_xlabel('Percentage (%)')
ax1.set_ylabel('Frequency')
ax1.axvline(data['percentage'].mean(), color='red', linestyle='--',
label=f'Mean: {data["percentage"].mean():.1f}%')
ax1.axvline(data['percentage'].median(), color='green', linestyle='--',
label=f'Median: {data["percentage"].median():.1f}%')
ax1.legend()
# 2. Box plot
ax2 = axes[0, 1]
sns.boxplot(data=data, y='percentage', ax=ax2, color='lightcoral')
ax2.set_title('Box Plot', fontweight='bold')
ax2.set_ylabel('Percentage (%)')
# Add statistical annotations
stats_text = f"Q1: {data['percentage'].quantile(0.25):.1f}%\n"
stats_text += f"Q3: {data['percentage'].quantile(0.75):.1f}%\n"
stats_text += f"IQR: {data['percentage'].quantile(0.75) - data['percentage'].quantile(0.25):.1f}%\n"
stats_text += f"Range: {data['percentage'].max() - data['percentage'].min():.1f}%"
ax2.text(0.02, 0.98, stats_text, transform=ax2.transAxes,
verticalalignment='top', fontsize=9,
bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
# 3. Violin plot
ax3 = axes[0, 2]
sns.violinplot(data=data, y='percentage', ax=ax3, color='lightgreen', inner='quartile')
ax3.set_title('Violin Plot', fontweight='bold')
ax3.set_ylabel('Percentage (%)')
# 4. ECDF (Empirical Cumulative Distribution Function)
ax4 = axes[1, 0]
sorted_data = np.sort(data['percentage'])
ecdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
ax4.plot(sorted_data, ecdf, marker='.', linestyle='none', color='purple')
ax4.set_title('Empirical CDF', fontweight='bold')
ax4.set_xlabel('Percentage (%)')
ax4.set_ylabel('Cumulative Probability')
ax4.grid(True, alpha=0.3)
# Add percentiles to ECDF
percentiles = [25, 50, 75, 90]
for p in percentiles:
percentile_value = np.percentile(data['percentage'], p)
ax4.axvline(percentile_value, color='orange', alpha=0.5, linestyle='--', linewidth=0.8)
ax4.text(percentile_value + 0.5, 0.1, f'{p}%', fontsize=8)
# 5. Q-Q Plot for normality check
ax5 = axes[1, 1]
stats.probplot(data['percentage'], dist="norm", plot=ax5)
ax5.set_title('Q-Q Plot (Normality Check)', fontweight='bold')
ax5.get_lines()[0].set_marker('.')
ax5.get_lines()[0].set_markeredgecolor('blue')
ax5.get_lines()[0].set_markerfacecolor('blue')
ax5.get_lines()[1].set_color('red')
# Calculate and display skewness and kurtosis
skewness = data['percentage'].skew()
kurtosis = data['percentage'].kurtosis()
ax5.text(0.05, 0.95, f"Skewness: {skewness:.3f}\nKurtosis: {kurtosis:.3f}",
transform=ax5.transAxes, verticalalignment='top',
bbox=dict(boxstyle='round', facecolor='yellow', alpha=0.5))
# 6. Probability Density Function (PDF) using KDE
ax6 = axes[1, 2]
kde = stats.gaussian_kde(data['percentage'])
x_range = np.linspace(data['percentage'].min() - 5, data['percentage'].max() + 5, 1000)
pdf = kde.evaluate(x_range)
ax6.plot(x_range, pdf, 'b-', linewidth=2, label='KDE')
ax6.fill_between(x_range, pdf, alpha=0.3, color='blue')
ax6.set_title('Probability Density Function (KDE)', fontweight='bold')
ax6.set_xlabel('Percentage (%)')
ax6.set_ylabel('Density')
# Add summary statistics to the last plot
summary_text = f"Mean: {data['percentage'].mean():.2f}%\n"
summary_text += f"Std Dev: {data['percentage'].std():.2f}%\n"
summary_text += f"Min: {data['percentage'].min():.1f}%\n"
summary_text += f"Max: {data['percentage'].max():.1f}%\n"
summary_text += f"Mode: {data['percentage'].mode().values[0]:.1f}%"
ax6.text(0.02, 0.98, summary_text, transform=ax6.transAxes,
verticalalignment='top', fontsize=9,
bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.5))
# Add a bar chart showing all dzongkhags (extra figure)
plt.figure(figsize=(14, 8))
bars = plt.barh(data['Dzongkhag'], data['percentage'],
color=plt.cm.viridis(data['percentage']/data['percentage'].max()))
plt.xlabel('Percentage (%)')
plt.title('Alcohol Consumption Percentage by Dzongkhag', fontweight='bold', fontsize=14)
plt.grid(axis='x', alpha=0.3)
# Add value labels to bars
for bar, value in zip(bars, data['percentage']):
plt.text(value + 0.5, bar.get_y() + bar.get_height()/2,
f'{value}%', va='center', fontsize=9)
# Highlight extremes
max_idx = data['percentage'].idxmax()
min_idx = data['percentage'].idxmin()
bars[max_idx].set_color('red')
bars[min_idx].set_color('green')
plt.tight_layout()
plt.show()
# Print statistical summary
print("\n" + "="*60)
print("PROBABILITY DISTRIBUTION ANALYSIS SUMMARY")
print("="*60)
print(f"Mean: {data['percentage'].mean():.2f}%")
print(f"Median: {data['percentage'].median():.2f}%")
print(f"Mode: {data['percentage'].mode().values[0]:.2f}%")
print(f"Standard Deviation: {data['percentage'].std():.2f}%")
print(f"Variance: {data['percentage'].var():.2f}")
print(f"Range: {data['percentage'].max() - data['percentage'].min():.2f}%")
print(f"25th Percentile: {data['percentage'].quantile(0.25):.2f}%")
print(f"75th Percentile: {data['percentage'].quantile(0.75):.2f}%")
print(f"IQR: {data['percentage'].quantile(0.75) - data['percentage'].quantile(0.25):.2f}%")
print(f"Skewness: {skewness:.3f}")
print(f"Kurtosis: {kurtosis:.3f}")
print("\nExtreme Values:")
print(f"Highest: {data.loc[max_idx, 'Dzongkhag']} - {data.loc[max_idx, 'percentage']}%")
print(f"Lowest: {data.loc[min_idx, 'Dzongkhag']} - {data.loc[min_idx, 'percentage']}%")
# Check for normality using Shapiro-Wilk test
if len(data) < 5000: # Shapiro-Wilk is only reliable for n < 5000
stat, p_value = stats.shapiro(data['percentage'])
print(f"\nNormality Test (Shapiro-Wilk):")
print(f" Test Statistic: {stat:.4f}")
print(f" p-value: {p_value:.4f}")
if p_value > 0.05:
print(" The data appears to be normally distributed (p > 0.05)")
else:
print(" The data does not appear to be normally distributed (p ≤ 0.05)")
Dataset Information: <class 'pandas.core.frame.DataFrame'> RangeIndex: 20 entries, 0 to 19 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Dzongkhag 20 non-null object 1 percentage 20 non-null float64 dtypes: float64(1), object(1) memory usage: 452.0+ bytes None First 5 rows: Dzongkhag percentage 0 bumthang 21.8 1 chukha 30.7 2 dagana 31.3 3 gasa 23.8 4 haa 27.9 Descriptive Statistics: count 20.000000 mean 33.645000 std 7.384798 min 21.800000 25% 29.675000 50% 33.550000 75% 37.100000 max 50.600000 Name: percentage, dtype: float64
============================================================ PROBABILITY DISTRIBUTION ANALYSIS SUMMARY ============================================================ Mean: 33.64% Median: 33.55% Mode: 21.80% Standard Deviation: 7.38% Variance: 54.54 Range: 28.80% 25th Percentile: 29.68% 75th Percentile: 37.10% IQR: 7.43% Skewness: 0.433 Kurtosis: 0.346 Extreme Values: Highest: lhuentse - 50.6% Lowest: bumthang - 21.8% Normality Test (Shapiro-Wilk): Test Statistic: 0.9667 p-value: 0.6838 The data appears to be normally distributed (p > 0.05)
In [ ]: