import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Read the ICRISAT district-level table from its CSV export into a DataFrame.
DATA_PATH = 'datasets/ICRISAT-District Level Data.csv'
df = pd.read_csv(DATA_PATH)

# Sanity-check what was loaded: overall dimensions, how many distinct states
# appear, which states contribute the most rows, and a sample of raw records.
print(f"Shape: {df.shape}")
print(f"Unique states: {df['State Name'].nunique()}")
rows_per_state = df['State Name'].value_counts()  # sorted, most frequent first
print(rows_per_state.head())  # head() defaults to 5 entries
first_rows = df.head(2)
print(first_rows)

Shape: (16146, 80)
Unique states: 20
State Name
Uttar Pradesh     2392
Madhya Pradesh    1924
Rajasthan         1352
Maharashtra       1348
Karnataka          988
Name: count, dtype: int64
   Dist Code  Year  State Code    State Name Dist Name  RICE AREA (1000 ha)  \
0          1  1966          14  Chhattisgarh      Durg                548.0   
1          1  1967          14  Chhattisgarh      Durg                547.0   

   RICE PRODUCTION (1000 tons)  RICE YIELD (Kg per ha)  WHEAT AREA (1000 ha)  \
0                        185.0                  337.59                  44.0   
1                        409.0                  747.71                  50.0   

   WHEAT PRODUCTION (1000 tons)  ...  SUGARCANE YIELD (Kg per ha)  \
0                          20.0  ...                      1777.78   
1                          26.0  ...                      1500.00   

   COTTON AREA (1000 ha)  COTTON PRODUCTION (1000 tons)  \
0                    0.0                            0.0   
1                    0.0                            0.0   

   COTTON YIELD (Kg per ha)  FRUITS AREA (1000 ha)  VEGETABLES AREA (1000 ha)  \
0                       0.0                   5.95                       6.64   
1                       0.0                   5.77                       7.24   

   FRUITS AND VEGETABLES AREA (1000 ha)  POTATOES AREA (1000 ha)  \
0                                 12.59                     0.01   
1                                 13.02                     0.01   

   ONION AREA (1000 ha)  FODDER AREA (1000 ha)  
0                  0.60                   0.47  
1                  0.56                   1.23  

[2 rows x 80 columns]

# -1 is treated as a missing-value sentinel: swap it for NaN (in place, across
# every column of df) so the imputer further down can fill those cells.
df.replace(to_replace=-1, value=np.nan, inplace=True)

# Target is the state label; features are all numeric columns except the
# identifier-like ones (district code, year, state code).
target_col = 'State Name'
EXCLUDED_COLS = {'Dist Code', 'Year', 'State Code'}
num_cols = list(df.select_dtypes(include=[np.number]).columns)
feature_cols = [name for name in num_cols if name not in EXCLUDED_COLS]

y = df[target_col]
X = df[feature_cols]

# Map state names to integer class ids; `le` is kept so predictions can be
# translated back to names later.
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)

# Fill remaining NaNs with each column's median.
# NOTE(review): the imputer is fitted on all rows, i.e. before the train/test
# split performed later in this file, so test rows influence the medians.
# Consider fitting on the training partition only.
imputer = SimpleImputer(strategy='median')
imputer.fit(X)
X_imputed = imputer.transform(X)

print(f"Features shape: {X_imputed.shape}")
print(f"Target classes: {le.classes_[:5]}...")  # only the first 5 class names

Features shape: (16146, 75)
Target classes: ['Andhra Pradesh' 'Assam' 'Bihar' 'Chhattisgarh' 'Gujarat']...

# Hold out 20% of the rows for evaluation. Passing stratify=y_encoded keeps the
# per-state class proportions the same in the train and test partitions.
SEED = 42
split_parts = train_test_split(
    X_imputed,
    y_encoded,
    test_size=0.2,
    random_state=SEED,
    stratify=y_encoded,
)
X_train, X_test, y_train, y_test = split_parts

# Fit a 100-tree random forest; n_jobs=-1 lets scikit-learn use all CPU cores.
clf = RandomForestClassifier(n_estimators=100, random_state=SEED, n_jobs=-1)
clf.fit(X_train, y_train)

# Predict on the held-out rows and also keep human-readable (state name)
# versions of both the true and the predicted labels.
y_pred = clf.predict(X_test)
y_test_labels = le.inverse_transform(y_test)
y_pred_labels = le.inverse_transform(y_pred)

print("Training complete!")

Training complete!

# Per-class precision / recall / F1 on the held-out rows, labelled with the
# original state names.
print(classification_report(y_test, y_pred, target_names=le.classes_))

# Confusion matrix. `labels` is pinned to every encoded class id so the matrix
# always has one row/column per entry of le.classes_, in the same order.
cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(le.classes_)))

# Wrap the matrix in a DataFrame indexed by state name so seaborn can take the
# tick labels from it. An integer xticklabels/yticklabels value then draws
# every 4th label *at its own row/column*. Passing a pre-sliced label list
# (le.classes_[::4]) instead would attach those few names to the first few
# cells, mislabelling the axes.
cm_df = pd.DataFrame(cm, index=le.classes_, columns=le.classes_)
plt.figure(figsize=(10, 8))
sns.heatmap(cm_df, annot=False, cmap='Blues', xticklabels=4, yticklabels=4)
plt.title('Confusion Matrix (Subset Labels)')
plt.xlabel('Predicted State')
plt.ylabel('True State')
plt.show()

# Feature importance (top 10): pair each feature name with the forest's
# importance score and keep the ten largest.
importances = pd.DataFrame({'feature': feature_cols, 'importance': clf.feature_importances_})
top_features = importances.sort_values('importance', ascending=False).head(10)
print(top_features)

# Horizontal bar chart of the ten most important features.
sns.barplot(data=top_features, x='importance', y='feature')
plt.title('Top 10 Feature Importances')
plt.show()

                  precision    recall  f1-score   support

  Andhra Pradesh       1.00      1.00      1.00       114
           Assam       1.00      0.99      1.00       104
           Bihar       1.00      1.00      1.00       114
    Chhattisgarh       1.00      1.00      1.00        62
         Gujarat       1.00      1.00      1.00       187
         Haryana       1.00      1.00      1.00        73
Himachal Pradesh       1.00      1.00      1.00       104
       Jharkhand       0.98      1.00      0.99        60
       Karnataka       1.00      1.00      1.00       198
          Kerala       0.99      1.00      1.00       103
  Madhya Pradesh       1.00      1.00      1.00       385
     Maharashtra       1.00      1.00      1.00       270
          Orissa       1.00      1.00      1.00       135
          Punjab       1.00      0.99      1.00       114
       Rajasthan       1.00      1.00      1.00       271
      Tamil Nadu       1.00      0.99      1.00       125
       Telangana       1.00      1.00      1.00        94
   Uttar Pradesh       1.00      1.00      1.00       479
     Uttarakhand       1.00      1.00      1.00        83
     West Bengal       1.00      1.00      1.00       155

        accuracy                           1.00      3230
       macro avg       1.00      1.00      1.00      3230
    weighted avg       1.00      1.00      1.00      3230

                                feature  importance
4          WHEAT PRODUCTION (1000 tons)    0.037195
3                  WHEAT AREA (1000 ha)    0.037189
0                   RICE AREA (1000 ha)    0.034349
72              POTATOES AREA (1000 ha)    0.030699
1           RICE PRODUCTION (1000 tons)    0.025169
64     SUGARCANE PRODUCTION (1000 tons)    0.023811
30             PIGEONPEA AREA (1000 ha)    0.023069
31     PIGEONPEA PRODUCTION (1000 tons)    0.022033
65          SUGARCANE YIELD (Kg per ha)    0.021959
16  PEARL MILLET PRODUCTION (1000 tons)    0.020814

Week 3: Machine Learning

Using Scikit-Learn for Classification on the ICRISAT Dataset

Step 1: Loading Dataset

Step 2: Preprocess Data

Step 3: Split Data and Train Model

Explanation

Step 4: Evaluate Model