import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import re

# Load CSV
df = pd.read_csv("datasets/ALD_Data_Big.csv")  # replace with your file path

# Keep only rows whose "Metric" text mentions deaths (case-insensitive; rows
# with a missing metric count as non-matches).
# The explicit .copy() makes death_rows an independent DataFrame rather than a
# slice of df, so the column assignments made on it further down no longer
# raise pandas' SettingWithCopyWarning.
is_death_metric = df["Metric"].str.contains("deaths", case=False, na=False)
death_rows = df[is_death_metric].copy()

# Extract numeric year (ignore ranges or text)
def extract_year(x):
    """Return the first run of four digits found in ``x`` as an int.

    ``x`` is passed through ``str()`` first, so non-string values are accepted.
    Returns None when the text contains no four consecutive digits.
    """
    found = re.search(r'\d{4}', str(x))
    if found is None:
        return None
    return int(found.group(0))

# Derive a numeric year from the free-form "Year" text (None when no year is found)
year_numbers = death_rows["Year"].apply(extract_year)
death_rows["Year_Num"] = year_numbers

# Coerce "Value" to numbers; anything that cannot be parsed becomes NaN
death_values = pd.to_numeric(death_rows["Value"], errors="coerce")
death_rows["Death_Value"] = death_values

# Discard rows that lack either a usable year or a usable value
death_rows = death_rows.dropna(subset=["Year_Num", "Death_Value"])

# Collapse to one row per year: repeated years are replaced by their mean
yearly_mean = death_rows.groupby("Year_Num")["Death_Value"].mean()
clean_data = yearly_mean.reset_index().rename(
    columns={"Year_Num": "Year", "Death_Value": "ALD_Deaths"}
)

print("Cleaned Dataset:")
print(clean_data)

# Prepare features and target
X = clean_data[["Year"]]
y = clean_data["ALD_Deaths"]

# Train Linear Regression model
model = LinearRegression()
model.fit(X, y)

# Forecast for 2025–2027
# The years are listed once and reused for both the prediction input and the
# result table, so the two cannot drift apart.
forecast_years = [2025, 2026, 2027]
future_years = np.array(forecast_years).reshape(-1, 1)

# The model was fitted on a DataFrame with a "Year" column, so predict from a
# DataFrame carrying the same column name. Passing the bare ndarray makes
# scikit-learn emit "X does not have valid feature names".
future_X = pd.DataFrame(future_years, columns=X.columns)
forecast = model.predict(future_X)

forecast_df = pd.DataFrame({
    "Year": forecast_years,
    "Predicted_ALD_Deaths": forecast.round(1)
})

print("\nForecasted ALD Deaths:")
print(forecast_df)

Cleaned Dataset:
   Year  ALD_Deaths
0  2016       189.2
1  2020       168.4
2  2021       138.8
3  2023       129.4

Forecasted ALD Deaths:
   Year  Predicted_ALD_Deaths
0  2025                 112.3
1  2026                 103.4
2  2027                  94.6

/tmp/ipykernel_6696/4000328202.py:17: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_rows["Year_Num"] = death_rows["Year"].apply(extract_year)
/tmp/ipykernel_6696/4000328202.py:20: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_rows["Death_Value"] = pd.to_numeric(death_rows["Value"], errors="coerce")
/opt/conda/lib/python3.13/site-packages/sklearn/utils/validation.py:2749: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import re

# Step 1: Load CSV
df = pd.read_csv("datasets/ALD_Data_Big.csv")  # replace with your CSV file path

# Step 2: Filter only rows whose "Metric" text mentions deaths
# (case-insensitive; rows with a missing metric count as non-matches).
# The explicit .copy() makes death_rows an independent DataFrame rather than a
# slice of df, so the column assignments made on it in Steps 3-4 no longer
# raise pandas' SettingWithCopyWarning.
is_death_metric = df["Metric"].str.contains("deaths", case=False, na=False)
death_rows = df[is_death_metric].copy()

# Step 3: Extract numeric year
def extract_year(x):
    """Return the first run of four digits found in ``x`` as an int.

    ``x`` is passed through ``str()`` first, so non-string values are accepted.
    Returns None when the text contains no four consecutive digits.
    """
    found = re.search(r'\d{4}', str(x))
    if found is None:
        return None
    return int(found.group(0))

# Step 3 (cont.): derive a numeric year from the free-form "Year" text
# (None when no year is found)
year_numbers = death_rows["Year"].apply(extract_year)
death_rows["Year_Num"] = year_numbers

# Step 4: Coerce "Value" to numbers; anything unparseable becomes NaN
death_values = pd.to_numeric(death_rows["Value"], errors="coerce")
death_rows["Death_Value"] = death_values

# Step 5: Discard rows that lack either a usable year or a usable value
death_rows = death_rows.dropna(subset=["Year_Num", "Death_Value"])

# Step 6: Collapse to one row per year; repeated years are replaced by their mean
yearly_mean = death_rows.groupby("Year_Num")["Death_Value"].mean()
clean_data = yearly_mean.reset_index().rename(
    columns={"Year_Num": "Year", "Death_Value": "ALD_Deaths"}
)

# Step 7: Prepare data for model
X = clean_data[["Year"]]
y = clean_data["ALD_Deaths"]

# Step 8: Train Linear Regression model
model = LinearRegression()
model.fit(X, y)

# Step 9: Forecast future years
# The years are listed once and reused for the prediction input, the plot and
# the result table, so they cannot drift apart.
forecast_years = [2025, 2026, 2027]
future_years = np.array(forecast_years).reshape(-1, 1)

# The model was fitted on a DataFrame with a "Year" column, so predict from a
# DataFrame carrying the same column name. Passing a bare ndarray makes
# scikit-learn emit "X does not have valid feature names".
future_X = pd.DataFrame(future_years, columns=X.columns)
forecast = model.predict(future_X)

# Combine historical + forecast for plotting
plot_years = np.concatenate([X["Year"].values, future_years.flatten()])
# NOTE(review): plot_values is not read anywhere in this cell; it is kept in
# case a later cell uses it.
plot_values = np.concatenate([y.values, forecast])

# Fitted line evaluated at every plotted year (historical and future), again
# passed as a DataFrame with the "Year" column to match how the model was fitted.
plot_X = pd.DataFrame({"Year": plot_years})
fitted_line = model.predict(plot_X)

# Step 10: Plot
plt.figure(figsize=(10,6))
plt.scatter(X["Year"], y, color='blue', label="Historical ALD Deaths")
plt.plot(plot_years, fitted_line, color='red', label="Linear Fit & Forecast")
plt.scatter(future_years, forecast, color='green', label="Forecast (2025-2027)", marker='x', s=100)
plt.xlabel("Year")
plt.ylabel("ALD Deaths")
plt.title("ALD Deaths Trend and Forecast (Linear Regression)")
plt.legend()
plt.grid(True)
plt.show()

# Step 11: Show forecast table
forecast_df = pd.DataFrame({
    "Year": forecast_years,
    "Predicted_ALD_Deaths": forecast.round(1)
})

print("Forecasted ALD Deaths:")
print(forecast_df)

/tmp/ipykernel_6696/700597134.py:18: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_rows["Year_Num"] = death_rows["Year"].apply(extract_year)
/tmp/ipykernel_6696/700597134.py:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  death_rows["Death_Value"] = pd.to_numeric(death_rows["Value"], errors="coerce")
/opt/conda/lib/python3.13/site-packages/sklearn/utils/validation.py:2749: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(
/opt/conda/lib/python3.13/site-packages/sklearn/utils/validation.py:2749: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(

Forecasted ALD Deaths:
   Year  Predicted_ALD_Deaths
0  2025                 112.3
1  2026                 103.4
2  2027                  94.6

Week 4: Machine Learning (28 November 2025)

Assignment: We are asked to fit a machine learning model to our datasets.

Introduction to the Dataset

Assignment: Build a machine learning model to forecast death rates for the years 2025, 2026, and 2027.

Fit a machine learning model to the dataset

Week 4: Machine Learning (28 November 2025)

Assignment: We are asked to fit a machine learning model to our datasets.

Compiled Dataset: Alcohol-Related Deaths / Burden in Bhutan

Introduction to the Dataset

Assignment: Build a machine learning model to forecast death rates for the years 2025, 2026, and 2027.

Fit a machine learning model to the dataset