import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Crop-wise fruit report (2021): rebuild the column labels, clean the text
# cells, and fit one straight line of production against bearing trees.

# Crops in the order they appear in the file; each crop owns three columns.
crops_ordered = [
    "Apple", "Areca nut", "Mandarin", "Watermelon", "Dragon fruit", "Kiwi", "Pear",
    "Peach", "Plum", "Apricot", "Persimmon", "Walnut", "Lemons and Lime",
    "Hazelnut", "Mango", "Guava", "Pomegranate", "Avacado", "Litchi",
    "Jack fruit", "Banana", "Tree Tomato", "Papaya", "Pineapple", "Passion fruit"
]

# Metric labels of the three columns that belong to every tree crop.
metrics = ["Total Tree", "Bearing Tree", "Production (MT)"]


def build_crop_columns(crops, tree_metrics):
    """Return the (crop, metric) label pairs, three per crop, in the given order.

    "Watermelon" is labelled with area metrics instead of tree counts; every
    other crop receives ``tree_metrics``.
    """
    area_metrics = ["Sown Area (Acre)", "Harvested Area (Acre)", "Production (MT)"]
    pairs = []
    for crop in crops:
        names = area_metrics if crop == "Watermelon" else tree_metrics
        pairs.extend((crop, name) for name in names)
    return pairs


def clean_cells(frame):
    """Blank out dash placeholders and strip thousands separators.

    Returns a new frame; cells are still text and need ``pd.to_numeric``.
    """
    # The pattern is anchored on purpose. With a non-string replacement pandas
    # swaps the whole cell as soon as the pattern is found anywhere inside it,
    # so an unanchored dash pattern would also wipe values such as "-5" or
    # "1-2". Only cells that are nothing but a dash (plus padding) are blanked.
    without_placeholders = frame.replace(r'^\s*-\s*$', np.nan, regex=True)
    return without_placeholders.replace(',', '', regex=True)


def collect_bearing_production(numeric, crops):
    """Stack (bearing trees, production) pairs of ``crops`` into one frame.

    ``numeric`` must carry (crop, metric) MultiIndex columns. Crops that are
    absent, or that lack either metric, are skipped, and rows with a missing
    value in either metric are dropped. The result has columns ``x`` (bearing
    trees) and ``y`` (production) and a fresh integer index.

    Raises:
        ValueError: if no crop contributes a complete pair.
    """
    present = set(numeric.columns.get_level_values(0))
    pieces = []
    for crop in crops:
        if crop not in present:
            continue
        sub = numeric[crop]
        if "Bearing Tree" not in sub.columns or "Production (MT)" not in sub.columns:
            continue
        pair = pd.DataFrame({'x': sub["Bearing Tree"], 'y': sub["Production (MT)"]}).dropna()
        if not pair.empty:
            pieces.append(pair)
    if not pieces:
        raise ValueError("No valid data found!")
    return pd.concat(pieces, ignore_index=True)


# The statements below stay at module level (inside the guard) so that every
# result name remains a global that later code can keep using.
if __name__ == "__main__":
    # Load raw data without assuming headers are clean
    df_raw = pd.read_csv('datasets/Final_report_tables_2021AS.csv', header=None)

    # The first two rows are header rows; they are kept for inspection only,
    # the column labels are rebuilt from crops_ordered instead.
    header1 = df_raw.iloc[0]  # Crop names
    header2 = df_raw.iloc[1]  # Metrics
    data_rows = df_raw.iloc[2:].reset_index(drop=True)

    # First column holds the row labels (Dzongkhag); the rest are values.
    dzongkhags = data_rows.iloc[:, 0]
    # Copy so that relabelling below never touches data_rows.
    data_values = data_rows.iloc[:, 1:].copy()

    # Rebuild clean MultiIndex columns
    columns = build_crop_columns(crops_ordered, metrics)
    if data_values.shape[1] != len(columns):
        # The crop list is hard-coded, so fail with a readable message when
        # the file layout does not match it.
        raise ValueError(
            f"Expected {len(columns)} value columns for {len(crops_ordered)} crops, "
            f"found {data_values.shape[1]}"
        )
    data_values.columns = pd.MultiIndex.from_tuples(columns, names=["Crop", "Metric"])

    # Set Dzongkhag as index
    data_values.index = dzongkhags

    # Clean data: dash placeholders become NaN, commas are removed
    df_clean = clean_cells(data_values)

    # Convert to numeric; anything still unparsable becomes NaN
    df_numeric = df_clean.apply(pd.to_numeric, errors='coerce')

    # Gather (Bearing Tree, Production) points for all non-Watermelon crops
    tree_crops = [c for c in crops_ordered if c != "Watermelon"]
    all_data = collect_bearing_production(df_numeric, tree_crops)

    x = all_data['x'].values
    y = all_data['y'].values

    # Fit linear model: y = a * x + b
    coeffs = np.polyfit(x, y, deg=1)
    a, b = coeffs

    # Plot
    x_fit = np.linspace(x.min(), x.max(), 200)
    y_fit = a * x_fit + b

    plt.figure(figsize=(10, 6))
    plt.scatter(x, y, alpha=0.6, label='All crops (Bearing vs Production)')
    plt.plot(x_fit, y_fit, color='red', linewidth=2,
             label=f'Linear fit: $P = {a:.5f}B + {b:.5f}$')
    plt.xlabel('Bearing Trees (count)')
    plt.ylabel('Production (MT)')
    plt.title('Combined Linear Fit Across All Tree Crops (2021)')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()

    print(f"Fitted equation: Production (MT) = {a:.6f} × Bearing Trees + {b:.6f}")

Fitted equation: Production (MT) = 0.012735 × Bearing Trees + 12.315360

Lesson 3: Fitting a function to the Dataset