import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

# -------------------------------
# 1. Load Dataset
# -------------------------------
# The path is relative, so it is resolved against the current working directory.
csv_path = "datasets/ALD_Data_Big.csv"   # <-- your CSV file
df = pd.read_csv(csv_path)

# -------------------------------
# 2. Clean and Extract Numeric Values
# -------------------------------
def extract_numeric(value):
    """
    Convert a raw cell value to ``float``, or return ``None`` if it is not numeric.

    The value is stringified, every '%' is removed and the Unicode minus
    sign ('−', U+2212) is replaced with an ASCII hyphen before parsing, so
    '15.22%' -> 15.22 and '−0.26' -> -0.26.

    Strings that carry extra text, such as '~140 (2012)' or
    '-0.26% decline', cannot be parsed and yield ``None``.
    """
    text = str(value).replace('%', '').replace('−', '-')
    try:
        return float(text)
    except ValueError:
        # float() signals unparseable text with ValueError. Catching only
        # that (instead of a bare ``except:``) keeps KeyboardInterrupt,
        # SystemExit and genuine bugs from being silently swallowed.
        return None

# Parse every raw "Value" cell; unparseable cells become missing values.
cleaned_column = df["Value"].apply(extract_numeric)
df["CleanValue"] = cleaned_column

# Keep only rows whose cleaned value is present and strictly positive.
is_numeric = df["CleanValue"].notnull()
is_positive = df["CleanValue"] > 0
clean_df = df[is_numeric & is_positive]

# Plain NumPy array of the retained values, used by the sections below.
values = clean_df["CleanValue"].values

print("Cleaned numeric dataset:")
print(values)

# -------------------------------
# 3. Compute Probability Distribution (PMF)
# -------------------------------
# np.unique returns the sorted distinct values together with how often each occurs.
unique_values, counts = np.unique(values, return_counts=True)

# Relative frequency of each distinct value (the probabilities sum to 1).
total_observations = counts.sum()
pmf = counts / total_observations

pmf_df = pd.DataFrame(
    {"Value": unique_values, "Count": counts, "Probability": pmf}
)

print("\nProbability Mass Function (PMF):")
print(pmf_df)

# -------------------------------
# 4. Plot Histogram (Probability Distribution)
# -------------------------------
# density=True normalises the bars so that their total area equals 1.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(values, bins=10, density=True, edgecolor='black')
ax.set_title("Probability Distribution of ALD Death Values")
ax.set_xlabel("ALD Death Counts")
ax.set_ylabel("Probability Density")
ax.grid(True)
plt.show()

# -------------------------------
# 5. Kernel Density Estimate (Smooth Distribution)
# -------------------------------
# Fit a Gaussian KDE to the cleaned values and evaluate it on 200 evenly
# spaced points spanning the observed range.
kde = gaussian_kde(values)
x = np.linspace(values.min(), values.max(), 200)
y = kde(x)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(x, y)
ax.set_title("KDE – Smooth Probability Distribution")
ax.set_xlabel("ALD Death Counts")
ax.set_ylabel("Density")
ax.grid(True)
plt.show()

Cleaned numeric dataset:
[125.   142.   186.   171.   162.   191.    15.22 137.   134.     8.22
 169.    10.22 143.   169.   171.   136.   136.    15.22   8.22 131.
 133.   124.   188.   191.   190.  ]

Probability Mass Function (PMF):
     Value  Count  Probability
0     8.22      2         0.08
1    10.22      1         0.04
2    15.22      2         0.08
3   124.00      1         0.04
4   125.00      1         0.04
5   131.00      1         0.04
6   133.00      1         0.04
7   134.00      1         0.04
8   136.00      2         0.08
9   137.00      1         0.04
10  142.00      1         0.04
11  143.00      1         0.04
12  162.00      1         0.04
13  169.00      2         0.08
14  171.00      2         0.08
15  186.00      1         0.04
16  188.00      1         0.04
17  190.00      1         0.04
18  191.00      2         0.08

import pandas as pd
import matplotlib.pyplot as plt

# 1. Load dataset
# The path is relative, so it is resolved against the current working directory.
csv_path = "datasets/ALD_Data_Big.csv"  # update with your file path
df = pd.read_csv(csv_path)

# 2. Clean numeric values
def extract_numeric(value):
    """
    Convert a raw cell value to ``float``, or return ``None`` if it is not numeric.

    The value is stringified, every '%' is removed and the Unicode minus
    sign ('−', U+2212) is replaced with an ASCII hyphen before parsing.
    Text that still is not a valid float literal yields ``None``.
    """
    text = str(value).replace('%', '').replace('−', '-')
    try:
        return float(text)
    except ValueError:
        # float() signals unparseable text with ValueError. Catching only
        # that (instead of a bare ``except:``) keeps KeyboardInterrupt,
        # SystemExit and genuine bugs from being silently swallowed.
        return None

# Parse every raw "Value" cell; unparseable cells become missing values.
cleaned_column = df["Value"].apply(extract_numeric)
df["CleanValue"] = cleaned_column

# Keep only rows whose cleaned value is present and strictly positive.
is_numeric = df["CleanValue"].notnull()
is_positive = df["CleanValue"] > 0
df_clean = df[is_numeric & is_positive]

# 3. Get unique years
# Distinct values of the "Year" column, in ascending order.
years = list(df_clean["Year"].unique())
years.sort()

# 4. Plot histograms year-wise
# One semi-transparent, density-normalised histogram per year, all drawn on
# the same axes so the yearly distributions can be compared.
fig, ax = plt.subplots(figsize=(12, 6))

colors = plt.cm.tab10.colors  # 10 distinct colors
n_colors = len(colors)
for idx, year in enumerate(years):
    in_year = df_clean["Year"] == year
    year_values = df_clean.loc[in_year, "CleanValue"]
    ax.hist(
        year_values,
        bins=10,
        alpha=0.5,
        color=colors[idx % n_colors],  # wrap around after 10 years
        label=str(year),
        density=True,
    )

ax.set_xlabel("ALD Death Counts")
ax.set_ylabel("Probability Density")
ax.set_title("Year-wise Histogram of ALD Death Counts")
ax.legend()
ax.grid(True)
plt.show()

Week 5: Probability (02 December 2025)¶

Assignment: We are asked to investigate the probability distribution of our datasets¶

Introduction to the Dataset¶

Probability Distribution of the Dataset¶

Explanation¶

Explanation¶

Week 5: Probability (02 December 2025)¶

Assignment: We are asked to investigate the probability distribution of our datasets¶

Compiled Dataset: Alcohol-Related Deaths / Burden in Bhutan¶

Introduction to the Dataset¶

Probability Distribution of the Dataset¶

Explanation¶

Probability Distribution in Relation to Years¶

Explanation¶