[Your-Name-Here] - Fab Futures - Data Science
Home About
In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import warnings
warnings.filterwarnings('ignore')

# Read the CSV file
df = pd.read_csv('datasets/data.csv')

# Let's check what columns we actually have
print("Columns in the DataFrame:", df.columns.tolist())
print("\nFirst few rows:")
print(df.head())

# If the column names are problematic, let's rename them
if len(df.columns) == 2:
    # Assuming first column is Dzongkhag and second is percentage
    df.columns = ['Dzongkhag', 'percentage']
elif 'Dzongkhag' in df.columns and 'percentage' not in df.columns:
    # Check which column contains the percentage data
    for col in df.columns:
        if col != 'Dzongkhag':
            df = df.rename(columns={col: 'percentage'})
            break

# Clean the percentage column if needed (remove any non-numeric characters)
df['percentage'] = pd.to_numeric(df['percentage'].astype(str).str.replace('%', ''), errors='coerce')

# Sort by percentage for better visualization
df = df.sort_values('percentage').reset_index(drop=True)

# Prepare data
x = np.arange(len(df))  # Use indices as x values
y = df['percentage'].values
dzongkhags = df['Dzongkhag'].values

print(f"\nData prepared for fitting:")
print(f"Number of data points: {len(df)}")
print(f"X range: {x.min()} to {x.max()}")
print(f"Y range: {y.min():.1f} to {y.max():.1f}")

# Define different functions to fit
def linear_func(x, a, b):
    return a * x + b

def quadratic_func(x, a, b, c):
    return a * x**2 + b * x + c

def cubic_func(x, a, b, c, d):
    return a * x**3 + b * x**2 + c * x + d

def exponential_func(x, a, b, c):
    return a * np.exp(b * x) + c

def polynomial_4th(x, a, b, c, d, e):
    return a * x**4 + b * x**3 + c * x**2 + d * x + e

# Try fitting with different functions
functions = {
    'Linear': linear_func,
    'Quadratic': quadratic_func,
    'Cubic': cubic_func,
    '4th Degree': polynomial_4th,
    'Exponential': exponential_func
}

# Create figure
plt.figure(figsize=(16, 10))

# Plot original data
plt.plot(x, y, 'bo-', linewidth=3, markersize=10, label='Original Data', markerfacecolor='white', markeredgewidth=2)

# Try fitting with each function
x_fit = np.linspace(0, len(df)-1, 200)

best_fit = None
best_r2 = -np.inf
best_params = None
best_func_name = ''

colors = ['orange', 'green', 'purple', 'brown', 'gray']
color_idx = 0

for func_name, func in functions.items():
    try:
        # Provide initial parameter guesses based on function type
        if func_name == 'Exponential':
            p0 = [1, 0.01, np.mean(y)]
        elif func_name == 'Linear':
            p0 = [0.5, np.mean(y)]
        elif func_name == 'Quadratic':
            p0 = [0.01, 0.5, np.mean(y)]
        elif func_name == 'Cubic':
            p0 = [0.001, 0.01, 0.5, np.mean(y)]
        elif func_name == '4th Degree':
            p0 = [0.0001, 0.001, 0.01, 0.5, np.mean(y)]
        
        params, _ = curve_fit(func, x, y, p0=p0, maxfev=10000)
        y_pred = func(x_fit, *params)
        
        # Calculate R-squared
        y_pred_actual = func(x, *params)
        ss_res = np.sum((y - y_pred_actual) ** 2)
        ss_tot = np.sum((y - np.mean(y)) ** 2)
        r_squared = 1 - (ss_res / ss_tot)
        
        # Plot the fitted curve
        plt.plot(x_fit, y_pred, '--', linewidth=2, alpha=0.8, 
                 color=colors[color_idx],
                 label=f'{func_name} (R²={r_squared:.4f})')
        color_idx += 1
        
        # Track best fit
        if r_squared > best_r2:
            best_r2 = r_squared
            best_fit = y_pred
            best_params = params
            best_func_name = func_name
            
    except Exception as e:
        print(f"Could not fit {func_name} function: {str(e)[:100]}")

# Add best fit with thicker line
if best_fit is not None:
    plt.plot(x_fit, best_fit, 'r-', linewidth=4, alpha=0.9, 
             label=f'Best Fit: {best_func_name} (R²={best_r2:.4f})')

# Customize the plot
plt.title('Alcohol Consumption Percentage by Dzongkhag (Sorted)', 
          fontsize=18, fontweight='bold', pad=20)
plt.xlabel('Dzongkhag (Sorted by Consumption)', fontsize=14)
plt.ylabel('Percentage of Alcohol Consumption', fontsize=14)

# Add dzongkhag names as x-tick labels (rotated for readability)
plt.xticks(x, dzongkhags, rotation=45, ha='right', fontsize=11)
plt.yticks(fontsize=11)

# Add horizontal grid
plt.grid(True, alpha=0.3, linestyle='--')

# Create two legends - one for data, one for fits
from matplotlib.lines import Line2D
legend_elements = [
    Line2D([0], [0], color='blue', marker='o', linestyle='-', 
           markersize=8, label='Actual Data', linewidth=3),
    Line2D([0], [0], color='red', linestyle='-', 
           linewidth=4, label=f'Best Fit: {best_func_name}'),
]

plt.legend(handles=legend_elements, loc='upper left', fontsize=11)

# Add data point labels with values
for i, (dz, perc) in enumerate(zip(dzongkhags, y)):
    plt.annotate(f'{perc:.1f}%', (x[i], y[i]), 
                 textcoords="offset points", 
                 xytext=(0,10), 
                 ha='center', 
                 fontsize=9,
                 bbox=dict(boxstyle="round,pad=0.2", facecolor="yellow", alpha=0.7))

# Add equation of best fit if available
if best_params is not None and best_func_name:
    plt.figtext(0.15, 0.02, 
                f"Best Fit Function: {best_func_name} | R² = {best_r2:.4f}", 
                fontsize=11, 
                bbox=dict(boxstyle="round,pad=0.5", facecolor="lightblue", alpha=0.8))

# Add statistics box
stats_text = f"Statistics:\n"
stats_text += f"Mean: {np.mean(y):.2f}%\n"
stats_text += f"Std Dev: {np.std(y):.2f}%\n"
stats_text += f"Min: {np.min(y):.2f}% ({dzongkhags[np.argmin(y)]})\n"
stats_text += f"Max: {np.max(y):.2f}% ({dzongkhags[np.argmax(y)]})"

plt.figtext(0.02, 0.98, stats_text, fontsize=10, 
            bbox=dict(boxstyle="round,pad=0.5", facecolor="lightgreen", alpha=0.8),
            verticalalignment='top')

plt.tight_layout(rect=[0, 0.05, 1, 0.95])
plt.show()

# Print detailed summary
print("\n" + "="*70)
print("FITTING RESULTS SUMMARY:")
print("="*70)

# Print all dzongkhags with their values
print("\nDzongkhags sorted by alcohol consumption percentage:")
for i, (dz, perc) in enumerate(zip(dzongkhags, y), 1):
    print(f"{i:2d}. {dz:20s}: {perc:5.1f}%")

print("\n" + "-"*70)

if best_func_name:
    print(f"BEST FIT FUNCTION: {best_func_name}")
    print(f"R-squared value: {best_r2:.6f}")
    print("\nFunction parameters:")
    for i, param in enumerate(best_params):
        print(f"  Parameter {i+1}: {param:.8f}")
    
    print("\nFunction equation:")
    if best_func_name == 'Linear':
        print(f"  y = {best_params[0]:.6f}x + {best_params[1]:.6f}")
    elif best_func_name == 'Quadratic':
        print(f"  y = {best_params[0]:.6f}x² + {best_params[1]:.6f}x + {best_params[2]:.6f}")
    elif best_func_name == 'Cubic':
        print(f"  y = {best_params[0]:.6f}x³ + {best_params[1]:.6f}x² + {best_params[2]:.6f}x + {best_params[3]:.6f}")
    elif best_func_name == '4th Degree':
        print(f"  y = {best_params[0]:.6f}x⁴ + {best_params[1]:.6f}x³ + {best_params[2]:.6f}x² + {best_params[3]:.6f}x + {best_params[4]:.6f}")
    elif best_func_name == 'Exponential':
        print(f"  y = {best_params[0]:.6f} * e^({best_params[1]:.6f}x) + {best_params[2]:.6f}")

print("="*70)
Columns in the DataFrame: ['percentage of alcohol consumption by dzongkhag wise', 'Unnamed: 1']

First few rows:
  percentage of alcohol consumption by dzongkhag wise  Unnamed: 1
0                                          Dzongkhag   percentage
1                                           bumthang         21.8
2                                             chukha         30.7
3                                             dagana         31.3
4                                               gasa         23.8

Data prepared for fitting:
Number of data points: 21
X range: 0 to 20
Y range: nan to nan
Could not fit Linear function: array must not contain infs or NaNs
Could not fit Quadratic function: array must not contain infs or NaNs
Could not fit Cubic function: array must not contain infs or NaNs
Could not fit 4th Degree function: array must not contain infs or NaNs
Could not fit Exponential function: array must not contain infs or NaNs
No description has been provided for this image
======================================================================
FITTING RESULTS SUMMARY:
======================================================================

Dzongkhags sorted by alcohol consumption percentage:
 1. bumthang            :  21.8%
 2. wangdue             :  22.0%
 3. gasa                :  23.8%
 4. haa                 :  27.9%
 5. mongar              :  28.7%
 6. samdrupjongkhar     :  30.0%
 7. chukha              :  30.7%
 8. dagana              :  31.3%
 9. samtse              :  32.5%
10. sarpang             :  33.2%
11. trashigang          :  33.9%
12. paro                :  34.3%
13. trongsa             :  34.5%
14. tsirang             :  34.9%
15. zhemgang            :  36.2%
16. thimphu             :  39.8%
17. trashiyangtse       :  40.0%
18. punakha             :  41.0%
19. pemagatshel         :  45.8%
20. lhuentse            :  50.6%
21. Dzongkhag           :   nan%

----------------------------------------------------------------------
======================================================================
In [ ]: