import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import signal
from scipy.stats import gaussian_kde

# Load the survey table from disk and echo its first rows so the run log
# shows what was actually read. Later steps use the 'Dzongkhag' and
# 'percentage' columns of this frame.
df = pd.read_csv('datasets/data.csv')
print("Data loaded successfully:", df.head(), sep="\n")
print("\n" + "="*50 + "\n")

# Convert percentage data to a time-series-like format.
# The rows have no real temporal ordering; each Dzongkhag is simply treated
# as one "time point" so that signal-processing tools can be applied.
percentages = df['percentage'].values

# Synthetic integer "time" axis, one step per row
time = np.arange(len(percentages))

# Build a longer synthetic signal: the percentages are used as amplitudes and
# linearly interpolated onto a grid with 10x as many points, which gives the
# spectrogram more samples to window over.
fs = 100  # Sampling frequency (arbitrary for visualization)
t = np.linspace(0, 1, len(percentages) * 10)
synthetic_signal = np.interp(t, np.linspace(0, 1, len(percentages)), percentages)

# Add Gaussian noise (sigma = 0.5) to make the spectrogram more interesting;
# the fixed seed keeps the result reproducible between runs.
np.random.seed(42)
synthetic_signal += np.random.normal(0, 0.5, len(synthetic_signal))

# Size the analysis window from the signal instead of hard-coding it.
# A fixed nperseg=256 is longer than a short signal (e.g. 200 samples), in
# which case SciPy warns, shrinks the window to the whole signal and returns a
# single time slice -- not a usable spectrogram. A quarter-length window with
# 50% overlap yields several overlapping segments, and still resolves to the
# 256/128 setting once the signal has at least 1024 samples.
nperseg = max(1, min(256, len(synthetic_signal) // 4))
noverlap = nperseg // 2

# Compute spectrogram: `frequencies` and `times` are the axis vectors and
# `Sxx` holds the power spectral density for each (frequency, time) cell.
frequencies, times, Sxx = signal.spectrogram(
    synthetic_signal,
    fs=fs,
    window='hann',
    nperseg=nperseg,
    noverlap=noverlap,
    scaling='density'
)

# Figure 1: three panels on a 2x2 grid, with the bottom row merged into a
# single wide panel. Each panel is drawn through its own Axes handle.
fig = plt.figure(figsize=(15, 10))

# Panel 1 (top-left): original bar chart, one bar per Dzongkhag
bar_ax = fig.add_subplot(2, 2, 1)
bars = bar_ax.bar(df['Dzongkhag'], df['percentage'])
bar_ax.set_title('Alcohol Consumption Percentage by Dzongkhag', fontsize=14, fontweight='bold')
bar_ax.set_xlabel('Dzongkhag', fontsize=12)
bar_ax.set_ylabel('Percentage (%)', fontsize=12)
# Slant the category labels so neighbouring names do not overlap
plt.setp(bar_ax.get_xticklabels(), rotation=45, ha='right')
bar_ax.grid(axis='y', alpha=0.3)

# Shade every bar by its value: min..max of the column is mapped onto viridis
cmap = plt.cm.viridis
norm = plt.Normalize(df['percentage'].min(), df['percentage'].max())
for bar, value in zip(bars, df['percentage']):
    bar.set_color(cmap(norm(value)))

# Panel 2 (top-right): the same values as a one-row heatmap (spectrogram-like)
heat_ax = fig.add_subplot(2, 2, 2)
heatmap_data = percentages.reshape(1, -1)  # imshow needs a 2D array
heat_image = heat_ax.imshow(
    heatmap_data,
    aspect='auto',
    cmap='viridis',
    extent=[0, len(percentages), 0, 1],
)
fig.colorbar(heat_image, ax=heat_ax, label='Percentage (%)')
heat_ax.set_title('Heatmap View (Spectrogram-like)', fontsize=14, fontweight='bold')
heat_ax.set_xlabel('Dzongkhag Index', fontsize=12)
heat_ax.set_yticks([])  # the single row has no meaningful y scale

# Panel 3 (bottom, full width): spectrogram of the synthetic signal, with the
# power converted to decibels
spec_ax = fig.add_subplot(2, 1, 2)
power_db = 10 * np.log10(Sxx)
spec_mesh = spec_ax.pcolormesh(times, frequencies, power_db, shading='gouraud', cmap='viridis')
spec_ax.set_title('Spectrogram of Synthetic Signal Based on Alcohol Consumption Data', 
                  fontsize=14, fontweight='bold')
spec_ax.set_ylabel('Frequency [Hz]', fontsize=12)
spec_ax.set_xlabel('Time [s]', fontsize=12)
fig.colorbar(spec_mesh, ax=spec_ax, label='Power/Frequency [dB/Hz]')
fig.tight_layout()

plt.show()

# Statistical analysis: print summary statistics of the 'percentage' column,
# name the Dzongkhags holding the extremes, then list the five highest rows.
pct = df['percentage']
lowest_label = pct.idxmin()
highest_label = pct.idxmax()

print("Statistical Summary of Alcohol Consumption Data:")
print("="*50)
print(f"Mean percentage: {pct.mean():.2f}%")
print(f"Median percentage: {pct.median():.2f}%")
print(f"Standard deviation: {pct.std():.2f}%")
print(f"Minimum: {pct.min():.1f}% (Dzongkhag: {df.loc[lowest_label, 'Dzongkhag']})")
print(f"Maximum: {pct.max():.1f}% (Dzongkhag: {df.loc[highest_label, 'Dzongkhag']})")

print("\nTop 5 Dzongkhags with highest alcohol consumption:")
top5 = df.nlargest(5, 'percentage')
# Walk the two columns in lockstep; the row index is not needed for the report
for name, value in zip(top5['Dzongkhag'], top5['percentage']):
    print(f"  {name}: {value}%")

# Alternative visualization: look at how the percentages are distributed
# rather than forcing them through a time/frequency transform.
print("\n" + "="*50)
print("\nCreating frequency distribution analysis...")

# Figure 2: two side-by-side panels
plt.figure(figsize=(12, 5))

# Left panel: density-normalised histogram, so the bars share a y scale with
# the density curve drawn on top of them
plt.subplot(1, 2, 1)
plt.hist(df['percentage'], bins=10, edgecolor='black', alpha=0.7, density=True)
plt.title('Distribution of Alcohol Consumption', fontsize=14, fontweight='bold')
plt.xlabel('Percentage (%)', fontsize=12)
plt.ylabel('Density', fontsize=12)

# Overlay a Gaussian kernel density estimate evaluated on 100 evenly spaced
# points between the smallest and largest observed percentage.
# (gaussian_kde is imported with the other dependencies at the top of the file.)
kde = gaussian_kde(df['percentage'])
x_range = np.linspace(df['percentage'].min(), df['percentage'].max(), 100)
plt.plot(x_range, kde(x_range), 'r-', linewidth=2)

# Right panel: values sorted ascending and plotted against their rank, for
# pattern recognition
plt.subplot(1, 2, 2)
sorted_df = df.sort_values('percentage')
plt.plot(range(len(sorted_df)), sorted_df['percentage'], 'o-', linewidth=2, markersize=8)
plt.title('Sorted Alcohol Consumption (Pattern Analysis)', fontsize=14, fontweight='bold')
plt.xlabel('Rank (sorted by percentage)', fontsize=12)
plt.ylabel('Percentage (%)', fontsize=12)
plt.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Simple frequency-domain view of the raw percentages using the FFT
print("\n" + "="*50)
print("Frequency Domain Analysis (FFT) of the data:")

# Subtract the mean first so the zero-frequency (DC) term does not dominate
centered = percentages - np.mean(percentages)
fft_values = np.fft.fft(centered)
fft_freq = np.fft.fftfreq(len(percentages))

# Plot the magnitude of the first half of the FFT output only
first_half = slice(0, len(percentages)//2)
fft_fig, fft_ax = plt.subplots(figsize=(10, 6))
fft_ax.plot(fft_freq[first_half], np.abs(fft_values[first_half]))
fft_ax.set_title('Frequency Domain Representation (FFT Magnitude)', fontsize=14, fontweight='bold')
fft_ax.set_xlabel('Frequency', fontsize=12)
fft_ax.set_ylabel('Magnitude', fontsize=12)
fft_ax.grid(alpha=0.3)
plt.show()

print("\nAnalysis complete!")

Data loaded successfully:
  Dzongkhag  percentage
0  bumthang        21.8
1    chukha        30.7
2    dagana        31.3
3      gasa        23.8
4       haa        27.9

==================================================

/tmp/ipykernel_15917/316118275.py:35: UserWarning: nperseg = 256 is greater than input length  = 200, using nperseg = 200
  frequencies, times, Sxx = signal.spectrogram(

Statistical Summary of Alcohol Consumption Data:
==================================================
Mean percentage: 33.64%
Median percentage: 33.55%
Standard deviation: 7.38%
Minimum: 21.8% (Dzongkhag: bumthang)
Maximum: 50.6% (Dzongkhag: lhuentse)

Top 5 Dzongkhags with highest alcohol consumption:
  lhuentse: 50.6%
  pemagatshel: 45.8%
  punakha: 41.0%
  trashiyangtse: 40.0%
  thimphu: 39.8%

==================================================

Creating frequency distribution analysis...

==================================================
Frequency Domain Analysis (FFT) of the data:

Analysis complete!