In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import warnings
warnings.filterwarnings('ignore')
# Read the CSV file
df = pd.read_csv('datasets/data.csv')
# Let's check what columns we actually have
print("Columns in the DataFrame:", df.columns.tolist())
print("\nFirst few rows:")
print(df.head())
# If the column names are problematic, let's rename them
if len(df.columns) == 2:
# Assuming first column is Dzongkhag and second is percentage
df.columns = ['Dzongkhag', 'percentage']
elif 'Dzongkhag' in df.columns and 'percentage' not in df.columns:
# Check which column contains the percentage data
for col in df.columns:
if col != 'Dzongkhag':
df = df.rename(columns={col: 'percentage'})
break
# Clean the percentage column if needed (remove any non-numeric characters)
df['percentage'] = pd.to_numeric(df['percentage'].astype(str).str.replace('%', ''), errors='coerce')
# Sort by percentage for better visualization
df = df.sort_values('percentage').reset_index(drop=True)
# Prepare data
x = np.arange(len(df)) # Use indices as x values
y = df['percentage'].values
dzongkhags = df['Dzongkhag'].values
print(f"\nData prepared for fitting:")
print(f"Number of data points: {len(df)}")
print(f"X range: {x.min()} to {x.max()}")
print(f"Y range: {y.min():.1f} to {y.max():.1f}")
# Define different functions to fit
def linear_func(x, a, b):
return a * x + b
def quadratic_func(x, a, b, c):
return a * x**2 + b * x + c
def cubic_func(x, a, b, c, d):
return a * x**3 + b * x**2 + c * x + d
def exponential_func(x, a, b, c):
return a * np.exp(b * x) + c
def polynomial_4th(x, a, b, c, d, e):
return a * x**4 + b * x**3 + c * x**2 + d * x + e
# Try fitting with different functions
functions = {
'Linear': linear_func,
'Quadratic': quadratic_func,
'Cubic': cubic_func,
'4th Degree': polynomial_4th,
'Exponential': exponential_func
}
# Create figure
plt.figure(figsize=(16, 10))
# Plot original data
plt.plot(x, y, 'bo-', linewidth=3, markersize=10, label='Original Data', markerfacecolor='white', markeredgewidth=2)
# Try fitting with each function
x_fit = np.linspace(0, len(df)-1, 200)
best_fit = None
best_r2 = -np.inf
best_params = None
best_func_name = ''
colors = ['orange', 'green', 'purple', 'brown', 'gray']
color_idx = 0
for func_name, func in functions.items():
try:
# Provide initial parameter guesses based on function type
if func_name == 'Exponential':
p0 = [1, 0.01, np.mean(y)]
elif func_name == 'Linear':
p0 = [0.5, np.mean(y)]
elif func_name == 'Quadratic':
p0 = [0.01, 0.5, np.mean(y)]
elif func_name == 'Cubic':
p0 = [0.001, 0.01, 0.5, np.mean(y)]
elif func_name == '4th Degree':
p0 = [0.0001, 0.001, 0.01, 0.5, np.mean(y)]
params, _ = curve_fit(func, x, y, p0=p0, maxfev=10000)
y_pred = func(x_fit, *params)
# Calculate R-squared
y_pred_actual = func(x, *params)
ss_res = np.sum((y - y_pred_actual) ** 2)
ss_tot = np.sum((y - np.mean(y)) ** 2)
r_squared = 1 - (ss_res / ss_tot)
# Plot the fitted curve
plt.plot(x_fit, y_pred, '--', linewidth=2, alpha=0.8,
color=colors[color_idx],
label=f'{func_name} (R²={r_squared:.4f})')
color_idx += 1
# Track best fit
if r_squared > best_r2:
best_r2 = r_squared
best_fit = y_pred
best_params = params
best_func_name = func_name
except Exception as e:
print(f"Could not fit {func_name} function: {str(e)[:100]}")
# Add best fit with thicker line
if best_fit is not None:
plt.plot(x_fit, best_fit, 'r-', linewidth=4, alpha=0.9,
label=f'Best Fit: {best_func_name} (R²={best_r2:.4f})')
# Customize the plot
plt.title('Alcohol Consumption Percentage by Dzongkhag (Sorted)',
fontsize=18, fontweight='bold', pad=20)
plt.xlabel('Dzongkhag (Sorted by Consumption)', fontsize=14)
plt.ylabel('Percentage of Alcohol Consumption', fontsize=14)
# Add dzongkhag names as x-tick labels (rotated for readability)
plt.xticks(x, dzongkhags, rotation=45, ha='right', fontsize=11)
plt.yticks(fontsize=11)
# Add horizontal grid
plt.grid(True, alpha=0.3, linestyle='--')
# Create two legends - one for data, one for fits
from matplotlib.lines import Line2D
legend_elements = [
Line2D([0], [0], color='blue', marker='o', linestyle='-',
markersize=8, label='Actual Data', linewidth=3),
Line2D([0], [0], color='red', linestyle='-',
linewidth=4, label=f'Best Fit: {best_func_name}'),
]
plt.legend(handles=legend_elements, loc='upper left', fontsize=11)
# Add data point labels with values
for i, (dz, perc) in enumerate(zip(dzongkhags, y)):
plt.annotate(f'{perc:.1f}%', (x[i], y[i]),
textcoords="offset points",
xytext=(0,10),
ha='center',
fontsize=9,
bbox=dict(boxstyle="round,pad=0.2", facecolor="yellow", alpha=0.7))
# Add equation of best fit if available
if best_params is not None and best_func_name:
plt.figtext(0.15, 0.02,
f"Best Fit Function: {best_func_name} | R² = {best_r2:.4f}",
fontsize=11,
bbox=dict(boxstyle="round,pad=0.5", facecolor="lightblue", alpha=0.8))
# Add statistics box
stats_text = f"Statistics:\n"
stats_text += f"Mean: {np.mean(y):.2f}%\n"
stats_text += f"Std Dev: {np.std(y):.2f}%\n"
stats_text += f"Min: {np.min(y):.2f}% ({dzongkhags[np.argmin(y)]})\n"
stats_text += f"Max: {np.max(y):.2f}% ({dzongkhags[np.argmax(y)]})"
plt.figtext(0.02, 0.98, stats_text, fontsize=10,
bbox=dict(boxstyle="round,pad=0.5", facecolor="lightgreen", alpha=0.8),
verticalalignment='top')
plt.tight_layout(rect=[0, 0.05, 1, 0.95])
plt.show()
# Print detailed summary
print("\n" + "="*70)
print("FITTING RESULTS SUMMARY:")
print("="*70)
# Print all dzongkhags with their values
print("\nDzongkhags sorted by alcohol consumption percentage:")
for i, (dz, perc) in enumerate(zip(dzongkhags, y), 1):
print(f"{i:2d}. {dz:20s}: {perc:5.1f}%")
print("\n" + "-"*70)
if best_func_name:
print(f"BEST FIT FUNCTION: {best_func_name}")
print(f"R-squared value: {best_r2:.6f}")
print("\nFunction parameters:")
for i, param in enumerate(best_params):
print(f" Parameter {i+1}: {param:.8f}")
print("\nFunction equation:")
if best_func_name == 'Linear':
print(f" y = {best_params[0]:.6f}x + {best_params[1]:.6f}")
elif best_func_name == 'Quadratic':
print(f" y = {best_params[0]:.6f}x² + {best_params[1]:.6f}x + {best_params[2]:.6f}")
elif best_func_name == 'Cubic':
print(f" y = {best_params[0]:.6f}x³ + {best_params[1]:.6f}x² + {best_params[2]:.6f}x + {best_params[3]:.6f}")
elif best_func_name == '4th Degree':
print(f" y = {best_params[0]:.6f}x⁴ + {best_params[1]:.6f}x³ + {best_params[2]:.6f}x² + {best_params[3]:.6f}x + {best_params[4]:.6f}")
elif best_func_name == 'Exponential':
print(f" y = {best_params[0]:.6f} * e^({best_params[1]:.6f}x) + {best_params[2]:.6f}")
print("="*70)
Columns in the DataFrame: ['percentage of alcohol consumption by dzongkhag wise', 'Unnamed: 1'] First few rows: percentage of alcohol consumption by dzongkhag wise Unnamed: 1 0 Dzongkhag percentage 1 bumthang 21.8 2 chukha 30.7 3 dagana 31.3 4 gasa 23.8 Data prepared for fitting: Number of data points: 21 X range: 0 to 20 Y range: nan to nan Could not fit Linear function: array must not contain infs or NaNs Could not fit Quadratic function: array must not contain infs or NaNs Could not fit Cubic function: array must not contain infs or NaNs Could not fit 4th Degree function: array must not contain infs or NaNs Could not fit Exponential function: array must not contain infs or NaNs
====================================================================== FITTING RESULTS SUMMARY: ====================================================================== Dzongkhags sorted by alcohol consumption percentage: 1. bumthang : 21.8% 2. wangdue : 22.0% 3. gasa : 23.8% 4. haa : 27.9% 5. mongar : 28.7% 6. samdrupjongkhar : 30.0% 7. chukha : 30.7% 8. dagana : 31.3% 9. samtse : 32.5% 10. sarpang : 33.2% 11. trashigang : 33.9% 12. paro : 34.3% 13. trongsa : 34.5% 14. tsirang : 34.9% 15. zhemgang : 36.2% 16. thimphu : 39.8% 17. trashiyangtse : 40.0% 18. punakha : 41.0% 19. pemagatshel : 45.8% 20. lhuentse : 50.6% 21. Dzongkhag : nan% ---------------------------------------------------------------------- ======================================================================
In [ ]: