import pandas as pd
import numpy as np

# Load data
df = pd.read_csv("datasets/Final_report_tables_2021AS.csv", index_col=0)

# Replace dashes and whitespace-only entries with NaN
df.replace([' -   ', ' -', '-', ''], np.nan, inplace=True)

# Clean column names (strip extra spaces)
df.columns = df.columns.str.strip()

# Display first few rows
df.head()

# Get column names and try to infer which are production columns
cols = df.columns.tolist()

# Every 3rd column starting from index 2 appears to be "Production (MT)"
prod_indices = list(range(2, len(cols), 3))
prod_cols = [cols[i] for i in prod_indices]

# Create a new dataframe with only production values
prod_df = df[prod_cols].copy()

# Clean and convert to numeric
def clean_number(x):
    if pd.isna(x):
        return np.nan
    # Remove commas and convert
    try:
        return float(str(x).replace(',', ''))
    except:
        return np.nan

prod_df = prod_df.map(clean_number)

# Assign meaningful column names (fruit names)
fruit_names = [cols[i-2] for i in prod_indices]  # fruit name is 2 columns before production
prod_df.columns = fruit_names

# Show result
prod_df.head()

prod_df.count().sort_values(ascending=False)
# Select Apple and Banana production
data = prod_df[['Apple', 'Banana']].dropna()
data.shape

(14, 2)

from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Ellipse

# Data
X = data.values

# Fit GMM
gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=0)
gmm.fit(X)

# Predictions
labels = gmm.predict(X)

# Plot scatter
plt.figure(figsize=(8,6))
plt.scatter(X[:,0], X[:,1], c=labels, cmap='viridis', s=100, edgecolor='k', zorder=3)

# Draw ellipses (fixed alpha)
def draw_ellipse(pos, cov, ax=None, **kwargs):
    ax = ax or plt.gca()
    U, s, Vt = np.linalg.svd(cov)
    angle = np.degrees(np.arctan2(U[1,0], U[0,0]))
    width, height = 2 * np.sqrt(s)
    e = Ellipse(xy=pos, width=width, height=height, angle=angle, **kwargs)
    ax.add_patch(e)

for pos, cov in zip(gmm.means_, gmm.covariances_):
    draw_ellipse(pos, cov, edgecolor='red', fc='none', lw=2, zorder=2)

plt.xlabel('Apple Production (MT)')
plt.ylabel('Banana Production (MT)')
plt.title('Gaussian Mixture Model (2 Components) – Fruit Production by Dzongkhag')
plt.grid(True)
plt.tight_layout()
plt.show()

# Create a grid
x_min, x_max = X[:,0].min() - 5, X[:,0].max() + 5
y_min, y_max = X[:,1].min() - 5, X[:,1].max() + 5
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                     np.linspace(y_min, y_max, 100))
grid_points = np.c_[xx.ravel(), yy.ravel()]

# Evaluate log-density
log_density = gmm.score_samples(grid_points)
density = np.exp(log_density)

# Reshape for plotting
zz = density.reshape(xx.shape)

# Plot density contour + data
plt.figure(figsize=(8,6))
contour = plt.contour(xx, yy, zz, levels=10, cmap='Blues')
plt.clabel(contour, inline=True, fontsize=8)
plt.scatter(X[:,0], X[:,1], c='red', s=80, edgecolor='k', label='Dzongkhags')
plt.xlabel('Apple Production (MT)')
plt.ylabel('Banana Production (MT)')
plt.title('Estimated Joint Probability Density (GMM)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

	Apple	Unnamed: 2	Unnamed: 3	Areca nut	Unnamed: 5	Unnamed: 6	Mandarin	Unnamed: 8	Unnamed: 9	Watermelon	...	Unnamed: 66	Papaya	Unnamed: 68	Unnamed: 69	Pineapple	Unnamed: 71	Unnamed: 72	Passion fruit	Unnamed: 74	Unnamed: 75
Dzongkhag	Total Tree	Bearing Tree	Production (MT)	Total Tree	Bearing Tree	Production (MT)	Total Tree	Bearing Tree	Production (MT)	Sown Area (Acre)	...	Production (MT)	Total Tree	Bearing Tree	Production (MT)	Total Tree	Bearing Tree	Production (MT)	Total Tree	Bearing Tree	Production (MT)
Bumthang	5,939	2,182	57.12	NaN	NaN	NaN	NaN	NaN	NaN	0.06	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Chukha	4,782	1,492	36.8	2,29,264	83,737	700.41	1,14,184	49,171	1,353.37	0.12	...	14.38	72	56	1.98	6,316	2,680	2.71	239	189	3.69
Dagana	258	18	0.16	4,30,893	1,65,405	1,111.14	2,25,820	1,31,028	2,791.97	6.18	...	42.77	1,989	1,363	38.9	40,830	15,648	26.89	415	310	5.77
Gasa	6	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.01	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	Apple	Areca nut	Mandarin	Watermelon	Dragon fruit	Kiwi	Pear	Peach	Plum	Apricot	...	Guava	Pomegranate	Avacado	Litchi	Jack fruit	Banana	Tree Tomato	Papaya	Pineapple	Passion fruit
Dzongkhag	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Bumthang	57.12	NaN	NaN	0.15	NaN	0.02	6.87	4.53	6.51	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Chukha	36.80	700.41	1353.37	0.17	NaN	40.69	43.83	39.55	10.01	4.84	...	39.53	0.44	1.42	24.01	7.02	207.60	14.38	1.98	2.71	3.69
Dagana	0.16	1111.14	2791.97	4.13	0.01	0.95	186.63	42.75	41.87	10.46	...	149.27	13.87	5.32	38.94	72.38	521.65	42.77	38.90	26.89	5.77
Gasa	NaN	NaN	NaN	0.02	NaN	NaN	1.16	0.97	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

Lesson 6: Density Estimation¶