import pandas as pd
import numpy as np

# Load raw data
df_raw = pd.read_csv("datasets/Final_report_tables_2021AS.csv", header=1)
df_raw.head()

# Clean function: replace '-' with NaN and remove commas
def clean_value(x):
    if pd.isna(x) or x == '-' or x == ' -   ':
        return np.nan
    if isinstance(x, str):
        return float(x.replace(',', ''))
    return x

# Apply cleaning
df_clean = df_raw.copy()
df_clean.iloc[:, 1:] = df_clean.iloc[:, 1:].map(clean_value)

# Set Dzongkhag as index
df_clean.set_index('Dzongkhag', inplace=True)

# Convert all to float
df_clean = df_clean.astype(float)

df_clean.head()

# Drop columns that are all NaN
df_clean = df_clean.dropna(axis=1, how='all')

# Standardize: (X - mean) / std
df_std = (df_clean - df_clean.mean()) / df_clean.std(ddof=0)  # population std

# Fill remaining NaNs with 0 (after standardization, missing = average)
df_std = df_std.fillna(0)

df_std.shape

(20, 75)

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Fit PCA
pca = PCA()
X_pca = pca.fit_transform(df_std)

# Explained variance
explained_var = pca.explained_variance_ratio_

# Plot
plt.figure(figsize=(10,5))
plt.plot(np.cumsum(explained_var), marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA: Cumulative Explained Variance (Standardized Fruit Data)')
plt.grid(True)
plt.show()

# Show top components
explained_var[:10].sum()

np.float64(0.9242988895782661)

# Get feature names
features = df_clean.columns

# Loadings for PC1 and PC2
loadings = pca.components_[:2].T  # shape: (75, 2)

# Create loading dataframe
loadings_df = pd.DataFrame(loadings, index=features, columns=['PC1', 'PC2'])

# Show top 10 absolute contributors to PC1
top_pc1 = loadings_df['PC1'].abs().sort_values(ascending=False).head(10)
top_pc2 = loadings_df['PC2'].abs().sort_values(ascending=False).head(10)

top_pc1, top_pc2

(Total Tree.19         0.189477
 Production (MT).24    0.185952
 Production (MT).20    0.185092
 Bearing Tree.23       0.184692
 Total Tree.14         0.184358
 Bearing Tree.14       0.183514
 Production (MT).8     0.180861
 Production (MT).15    0.179570
 Bearing Tree.19       0.178233
 Total Tree.15         0.176051
 Name: PC1, dtype: float64,
 Total Tree.5          0.269127
 Bearing Tree.5        0.262484
 Total Tree.6          0.213236
 Bearing Tree.6        0.209074
 Total Tree.7          0.192993
 Bearing Tree.18       0.192506
 Production (MT).19    0.186512
 Production (MT).7     0.180939
 Total Tree.9          0.180555
 Total Tree.17         0.177959
 Name: PC2, dtype: float64)

# Reload with header=0 to get fruit names row
df_header = pd.read_csv("datasets/Final_report_tables_2021AS.csv", header=0, nrows=1)

# The first row looks like:
df_header

# Define fruit list in order
fruits = [
    "Apple", "Areca nut", "Mandarin", "Watermelon", "Dragon fruit", "Kiwi", "Pear", "Peach", "Plum",
    "Apricot", "Persimmon", "Walnut", "Lemons and Lime", "Hazelnut", "Mango", "Guava", "Pomegranate",
    "Avacado", "Litchi", "Jack fruit", "Banana", "Tree Tomato", "Papaya", "Pineapple", "Passion fruit"
]

# Metric names per fruit (except Watermelon uses area)
tree_metrics = ["Total Tree", "Bearing Tree", "Production (MT)"]
watermelon_metrics = ["Sown Area (Acre)", "Harvested Area (Acre)", "Production (MT)"]

feature_labels = []
for i, fruit in enumerate(fruits):
    if fruit == "Watermelon":
        metrics = watermelon_metrics
    else:
        metrics = tree_metrics
    for m in metrics:
        feature_labels.append(f"{fruit} - {m}")

len(feature_labels)

75

# Assign semantic names to df_clean columns
df_clean.columns = feature_labels
df_std.columns = feature_labels

# Recompute PCA with labeled features
pca = PCA()
X_pca = pca.fit_transform(df_std)

loadings_df = pd.DataFrame(
    pca.components_[:2].T,
    index=feature_labels,
    columns=['PC1', 'PC2']
)

# Top contributors
top_pc1 = loadings_df['PC1'].abs().sort_values(ascending=False).head(10)
top_pc2 = loadings_df['PC2'].abs().sort_values(ascending=False).head(10)

print("Top features in PC1:")
print(top_pc1)
print("\nTop features in PC2:")
print(top_pc2)

Top features in PC1:
Banana - Total Tree                0.189477
Passion fruit - Production (MT)    0.185952
Banana - Production (MT)           0.185092
Passion fruit - Bearing Tree       0.184692
Guava - Total Tree                 0.184358
Guava - Bearing Tree               0.183514
Plum - Production (MT)             0.180861
Guava - Production (MT)            0.179570
Banana - Bearing Tree              0.178233
Pomegranate - Total Tree           0.176051
Name: PC1, dtype: float64

Top features in PC2:
Pear - Total Tree               0.269127
Pear - Bearing Tree             0.262484
Peach - Total Tree              0.213236
Peach - Bearing Tree            0.209074
Plum - Total Tree               0.192993
Jack fruit - Bearing Tree       0.192506
Jack fruit - Production (MT)    0.186512
Peach - Production (MT)         0.180939
Persimmon - Total Tree          0.180555
Litchi - Total Tree             0.177959
Name: PC2, dtype: float64

# Plot Dzongkhags in PC1-PC2 space
plt.figure(figsize=(10,8))
plt.scatter(X_pca[:,0], X_pca[:,1], s=100)

# Annotate Dzongkhag names
for i, dz in enumerate(df_clean.index):
    plt.text(X_pca[i,0]+0.02, X_pca[i,1]+0.02, dz, fontsize=9)

plt.xlabel('PC1: Tropical Fruit Intensity')
plt.ylabel('PC2: Temperate Orchard Intensity')
plt.title('Bhutan Dzongkhags in PCA Space (Fruit Production)')
plt.grid(True)
plt.axhline(0, color='gray', linewidth=0.5)
plt.axvline(0, color='gray', linewidth=0.5)
plt.show()

	Dzongkhag	Total Tree	Bearing Tree	Production (MT)	Total Tree.1	Bearing Tree.1	Production (MT).1	Total Tree.2	Bearing Tree.2	Production (MT).2	...	Production (MT).21	Total Tree.21	Bearing Tree.21	Production (MT).22	Total Tree.22	Bearing Tree.22	Production (MT).23	Total Tree.23	Bearing Tree.23	Production (MT).24
0	Bumthang	5,939	2,182	57.12	-	-	-	-	-	-	...	-	-	-	-	-	-	-	-	-	-
1	Chukha	4,782	1,492	36.8	2,29,264	83,737	700.41	1,14,184	49,171	1,353.37	...	14.38	72	56	1.98	6,316	2,680	2.71	239	189	3.69
2	Dagana	258	18	0.16	4,30,893	1,65,405	1,111.14	2,25,820	1,31,028	2,791.97	...	42.77	1,989	1,363	38.9	40,830	15,648	26.89	415	310	5.77
3	Gasa	6	-	-	-	-	-	-	-	-	...	-	-	-	-	-	-	-	-	-	-
4	Haa	13,099	7,075	84.94	-	-	-	5,661	1,527	19.45	...	3.33	-	-	-	99	10	0.01	126	112	1.22

	Total Tree	Bearing Tree	Production (MT)	Total Tree.1	Bearing Tree.1	Production (MT).1	Total Tree.2	Bearing Tree.2	Production (MT).2	Sown Area (Acre)	...	Production (MT).21	Total Tree.21	Bearing Tree.21	Production (MT).22	Total Tree.22	Bearing Tree.22	Production (MT).23	Total Tree.23	Bearing Tree.23	Production (MT).24
Dzongkhag
Bumthang	5939.0	2182.0	57.12	NaN	NaN	NaN	NaN	NaN	NaN	0.06	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Chukha	4782.0	1492.0	36.80	229264.0	83737.0	700.41	114184.0	49171.0	1353.37	0.12	...	14.38	72.0	56.0	1.98	6316.0	2680.0	2.71	239.0	189.0	3.69
Dagana	258.0	18.0	0.16	430893.0	165405.0	1111.14	225820.0	131028.0	2791.97	6.18	...	42.77	1989.0	1363.0	38.90	40830.0	15648.0	26.89	415.0	310.0	5.77
Gasa	6.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.01	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Haa	13099.0	7075.0	84.94	NaN	NaN	NaN	5661.0	1527.0	19.45	0.04	...	3.33	NaN	NaN	NaN	99.0	10.0	0.01	126.0	112.0	1.22

Lesson 7: Transform¶