In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import signal
# Read the CSV file
df = pd.read_csv('datasets/data.csv')
print("Data loaded successfully:")
print(df.head())
print("\n" + "="*50 + "\n")
# Convert percentage data to a time-series-like format
# We'll treat Dzongkhags as time points
percentages = df['percentage'].values
# Create a synthetic "time" axis
time = np.arange(len(percentages))
# Create a spectrogram using scipy
# Since we don't have actual temporal data, we'll create a synthetic signal
# by treating the percentages as amplitudes
# Create a longer synthetic signal by repeating/expanding the data
# This gives us more data points for a meaningful spectrogram
fs = 100 # Sampling frequency (arbitrary for visualization)
t = np.linspace(0, 1, len(percentages) * 10)
synthetic_signal = np.interp(t, np.linspace(0, 1, len(percentages)), percentages)
# Add some noise to make the spectrogram more interesting
np.random.seed(42)
synthetic_signal += np.random.normal(0, 0.5, len(synthetic_signal))
# Compute spectrogram
frequencies, times, Sxx = signal.spectrogram(
synthetic_signal,
fs=fs,
window='hann',
nperseg=256,
noverlap=128,
scaling='density'
)
# Plot 1: Original bar chart
plt.figure(figsize=(15, 10))
plt.subplot(2, 2, 1)
bars = plt.bar(df['Dzongkhag'], df['percentage'])
plt.title('Alcohol Consumption Percentage by Dzongkhag', fontsize=14, fontweight='bold')
plt.xlabel('Dzongkhag', fontsize=12)
plt.ylabel('Percentage (%)', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)
# Color bars by percentage value
cmap = plt.cm.viridis
norm = plt.Normalize(df['percentage'].min(), df['percentage'].max())
for bar, value in zip(bars, df['percentage']):
bar.set_color(cmap(norm(value)))
# Plot 2: Heatmap visualization (spectrogram-like)
plt.subplot(2, 2, 2)
heatmap_data = percentages.reshape(1, -1) # Reshape to 2D for heatmap
plt.imshow(heatmap_data, aspect='auto', cmap='viridis',
extent=[0, len(percentages), 0, 1])
plt.colorbar(label='Percentage (%)')
plt.title('Heatmap View (Spectrogram-like)', fontsize=14, fontweight='bold')
plt.xlabel('Dzongkhag Index', fontsize=12)
plt.yticks([])
# Plot 3: Actual spectrogram of synthetic signal
plt.subplot(2, 1, 2)
plt.pcolormesh(times, frequencies, 10 * np.log10(Sxx), shading='gouraud', cmap='viridis')
plt.title('Spectrogram of Synthetic Signal Based on Alcohol Consumption Data',
fontsize=14, fontweight='bold')
plt.ylabel('Frequency [Hz]', fontsize=12)
plt.xlabel('Time [s]', fontsize=12)
plt.colorbar(label='Power/Frequency [dB/Hz]')
plt.tight_layout()
plt.show()
# Statistical analysis
print("Statistical Summary of Alcohol Consumption Data:")
print("="*50)
print(f"Mean percentage: {df['percentage'].mean():.2f}%")
print(f"Median percentage: {df['percentage'].median():.2f}%")
print(f"Standard deviation: {df['percentage'].std():.2f}%")
print(f"Minimum: {df['percentage'].min():.1f}% (Dzongkhag: {df.loc[df['percentage'].idxmin(), 'Dzongkhag']})")
print(f"Maximum: {df['percentage'].max():.1f}% (Dzongkhag: {df.loc[df['percentage'].idxmax(), 'Dzongkhag']})")
print("\nTop 5 Dzongkhags with highest alcohol consumption:")
top5 = df.nlargest(5, 'percentage')
for idx, row in top5.iterrows():
print(f" {row['Dzongkhag']}: {row['percentage']}%")
# Alternative visualization: Create a more meaningful frequency representation
print("\n" + "="*50)
print("\nCreating frequency distribution analysis...")
# Create histogram with density plot
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.hist(df['percentage'], bins=10, edgecolor='black', alpha=0.7, density=True)
plt.title('Distribution of Alcohol Consumption', fontsize=14, fontweight='bold')
plt.xlabel('Percentage (%)', fontsize=12)
plt.ylabel('Density', fontsize=12)
# Add kernel density estimate
from scipy.stats import gaussian_kde
kde = gaussian_kde(df['percentage'])
x_range = np.linspace(df['percentage'].min(), df['percentage'].max(), 100)
plt.plot(x_range, kde(x_range), 'r-', linewidth=2)
# Create a sorted view for pattern recognition
plt.subplot(1, 2, 2)
sorted_df = df.sort_values('percentage')
plt.plot(range(len(sorted_df)), sorted_df['percentage'], 'o-', linewidth=2, markersize=8)
plt.title('Sorted Alcohol Consumption (Pattern Analysis)', fontsize=14, fontweight='bold')
plt.xlabel('Rank (sorted by percentage)', fontsize=12)
plt.ylabel('Percentage (%)', fontsize=12)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
# Create a simple frequency domain representation using FFT
print("\n" + "="*50)
print("Frequency Domain Analysis (FFT) of the data:")
# Perform FFT on the percentage data
fft_values = np.fft.fft(percentages - np.mean(percentages)) # Remove DC component
fft_freq = np.fft.fftfreq(len(percentages))
# Plot FFT magnitude
plt.figure(figsize=(10, 6))
plt.plot(fft_freq[:len(percentages)//2], np.abs(fft_values[:len(percentages)//2]))
plt.title('Frequency Domain Representation (FFT Magnitude)', fontsize=14, fontweight='bold')
plt.xlabel('Frequency', fontsize=12)
plt.ylabel('Magnitude', fontsize=12)
plt.grid(alpha=0.3)
plt.show()
print("\nAnalysis complete!")
Data loaded successfully: Dzongkhag percentage 0 bumthang 21.8 1 chukha 30.7 2 dagana 31.3 3 gasa 23.8 4 haa 27.9 ==================================================
/tmp/ipykernel_15917/316118275.py:35: UserWarning: nperseg = 256 is greater than input length = 200, using nperseg = 200 frequencies, times, Sxx = signal.spectrogram(
Statistical Summary of Alcohol Consumption Data: ================================================== Mean percentage: 33.64% Median percentage: 33.55% Standard deviation: 7.38% Minimum: 21.8% (Dzongkhag: bumthang) Maximum: 50.6% (Dzongkhag: lhuentse) Top 5 Dzongkhags with highest alcohol consumption: lhuentse: 50.6% pemagatshel: 45.8% punakha: 41.0% trashiyangtse: 40.0% thimphu: 39.8% ================================================== Creating frequency distribution analysis...
================================================== Frequency Domain Analysis (FFT) of the data:
Analysis complete!
In [ ]: