
Probability

This notebook is in three parts:

  1. Investigating the probability distribution
  2. Fitting a probability distribution
  3. Density estimation

Investigating the probability distribution of my data

Again, I started the assignment by asking Gemini the following prompts so I could get a better grasp of what I'm expected to do:

  1. With the goal of "quantifying uncertainty", what does it mean to investigate the probability distribution of my data?
  2. What are the most appropriate functions and/or representations for investigating (not fitting) the probability distribution of my data? Please provide explanations for why.

Response 1:

To "investigate the probability distribution" means to stop looking at your data as single, absolute numbers (e.g., "50 deaths") and start looking at them as ranges of likelihood (e.g., "Usually around 50, but sometimes as low as 40 or as high as 60").

In the context of "Quantifying Uncertainty," it helps you move from saying "This will happen" to "This is how sure we are that this will happen."

When you fit a line to your data, you are finding the Average Trend. But real life doesn't follow the average perfectly. It scatters around the line.

  • The Point Estimate: "I predict 55 cases." (This is your Trend Line).
  • The Probability Distribution: "How much did we miss by in the past?"
      • Did we usually miss by 1 or 2? (Tight distribution = High Certainty).
      • Did we sometimes miss by 50? (Wide distribution = High Uncertainty).
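To make the "tight vs. wide" idea concrete, here is a minimal sketch (with made-up numbers, not my actual data) of turning a point estimate into a range using the spread of past errors:

import numpy as np

# Toy example: how much past predictions missed by (residuals)
past_errors = np.array([-2, 1, 0, -1, 2])

point_estimate = 55                   # "I predict 55 cases"
spread = np.std(past_errors, ddof=1)  # typical size of a miss

# A rough ~95% range: estimate +/- 2 standard deviations
# (this rule of thumb assumes roughly bell-shaped errors)
low, high = point_estimate - 2*spread, point_estimate + 2*spread
print(f"Prediction: {point_estimate}, likely range: {low:.1f} to {high:.1f}")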

Response 2:

To "investigate" the probability distribution means to perform Exploratory Data Analysis (EDA). You are acting like a detective looking for clues about the "personality" of your data before you try to force it into a mathematical box.

Since you are analyzing trends (Time vs. Deaths), you cannot just investigate the raw numbers (because 2022 is naturally higher than 2018). Instead, you must investigate the Residuals (the errors).
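To convince myself of this point, here is a quick toy sketch (my own, not from Gemini): the spread of the raw values is dominated by the trend, while the spread of the residuals reflects only the actual noise.

import numpy as np

# Made-up trending series: deaths rise over time, plus a little noise
years = np.array([2018, 2019, 2020, 2021, 2022])
deaths = np.array([40, 45, 52, 55, 61])

print(f"SD of raw values: {np.std(deaths, ddof=1):.1f}")  # mostly just the trend

# Detrend: fit a line, subtract it, and look at the leftovers
trend = np.poly1d(np.polyfit(years, deaths, 1))(years)
residuals = deaths - trend
print(f"SD of residuals:  {np.std(residuals, ddof=1):.1f}")  # the real noise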

Here was the most helpful part of the response:

For a very small dataset like yours ($N=5$), standard tools like Histograms are actually bad because they group data into "bins," hiding detail. Here are the most appropriate representations for small data:

  1. The Residual Scatter Plot (Residuals vs. Time). Why it is appropriate: It checks for patterns. If your dots form a U-shape, your straight line model is wrong. If the dots fan out (getting wider), your uncertainty is increasing over time.
  2. The Strip Plot (or Rug Plot). Why it is appropriate: With only 5 data points, a histogram is blocky and misleading. A strip plot shows you the exact position of every error, revealing if they are clumped together or spread out.
  3. The Box Plot. Why it is appropriate: It instantly flags Outliers. If one dot is far outside the box, you know that year was statistically abnormal.

The Most Appropriate Functions (Math)

Since you are "investigating" and not yet "fitting," you use Descriptive Statistics (functions that describe what is, not what should be):

  • Mean of Residuals ($\mu$): Should be Zero. If not, the model is biased.
  • Standard Deviation ($\sigma$): Measures the "Spread." A high number means high uncertainty.
  • Skewness: Measures "Lopsidedness." 0 = Symmetrical (Normal); Positive = the model underestimates huge spikes (rare disasters). Why: vital for disease data, which often spikes high (outbreaks) but rarely drops below zero.
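Before applying any of this to my real data, here is a minimal self-contained sketch (toy residuals, not mine) of computing all three descriptive statistics; scipy.stats.skew is one common way to get skewness:

import numpy as np
from scipy.stats import skew

residuals = np.array([-3.0, -1.0, 0.5, 1.0, 2.5])  # toy errors

print(f"Mean:     {np.mean(residuals):.2f}")         # should be near zero
print(f"Std Dev:  {np.std(residuals, ddof=1):.2f}")  # the spread
print(f"Skewness: {skew(residuals):.2f}")            # 0 = symmetrical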

Starting the assignment:

In [1]:
import pandas as pd
import numpy as np
In [2]:
# As always, loading my file
df = pd.read_csv("datasets/Mortality cases3.csv")
df.head(5)
Out[2]:
  ICD10 CODE                  Name of the Disease Type of disease  2018  2019  2020  2021  2022
0       A02ᴳ                            Diarrhoea      Infectious   6.0   6.0   2.0   NaN   2.0
1       A03ᴳ                            Dysentery      Infectious   NaN   NaN   NaN   NaN   NaN
2       A15ᴳ                         Tuberculosis      Infectious  22.0  20.0  20.0  31.0  17.0
3       A41ᴳ  Other Sepsis, including Septicaemia      Infectious  62.0  46.0  52.0  32.0  45.0
4        A50                  Congenital Syphilis      Infectious   NaN   NaN   NaN   NaN   NaN
In [3]:
# Variable for grouping my data 
years = ['2018', '2019', '2020', '2021', '2022']

# Trimming whitespace:
df['Type of disease'] = df['Type of disease'].str.strip()

# Editing my data since some of the categories became confusing to work with:
df['Type of disease'] = df['Type of disease'].replace('Infectious', 'Infectious Diseases')

# New table! Grouping my data by the preferred category and summing the years for total deaths per category per year
df_grouped = df.groupby('Type of disease')[years].sum() # This is from Gemini

I asked Gemini the following prompt:

Given my data, please explain step-by-step how I can construct

  1. A Residual Scatter Plot
  2. A Strip Plot, and
  3. A Box Plot on my Jupyter notebook.

The following code was taken from Gemini, with some edits of my own.

Just to note: I have mainly stuck with plotly so far because the LLMs recommended it to start with, and when I tried graphing similar things using matplotlib, I found the plotly graphs more suited to my taste in terms of their interactivity. The interactivity doesn't always carry over to the live website, but in the notebook it provides some features that are genuinely helpful for me.

In [4]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def create_diagnostic_plots(category_name):
    # --- PART 1: PREPARE THE DATA & MATH ---
    # Safety Check
    if category_name not in df_grouped.index:
        print(f"Error: {category_name} not found.")
        return

    # Get X (Years) and Y (Actual Deaths)
    y_actual = df_grouped.loc[category_name].values
    x_years = np.array([2018, 2019, 2020, 2021, 2022])

    # Fit Linear Model (y = mx + c)
    coef = np.polyfit(x_years, y_actual, 1)
    poly_fn = np.poly1d(coef)

    # Calculate Residuals (The "Error")
    # Residual = Reality - Prediction
    y_predicted = poly_fn(x_years)
    residuals = y_actual - y_predicted

    # --- PART 2: CREATE THE CANVAS ---
    # We create a figure with 3 separate slots (subplots)
    fig = make_subplots(
        rows=1, cols=3,
        subplot_titles=("1. Residual Scatter Plot", "2. Strip Plot", "3. Box Plot"),
        column_widths=[0.5, 0.25, 0.25] # Make the first plot wider
    )

    # --- PART 3: PLOT 1 - RESIDUAL SCATTER PLOT ---
    # Goal: See if errors change over time (Time Pattern)
    fig.add_trace(go.Scatter(
        x=x_years, y=residuals,
        mode='markers', 
        marker=dict(size=10, color='blue'),
        name='Residual'
    ), row=1, col=1)

    # Add a red "Zero Line" (The target)
    fig.add_trace(go.Scatter(
        x=[2018, 2022], y=[0, 0],
        mode='lines',
        line=dict(color='red', dash='dash'),
        name='Zero Line'
    ), row=1, col=1)

    # --- PART 4: PLOT 2 - STRIP PLOT ---
    # Goal: See the raw spread of errors (Clustering)
    # We use a Box plot hack: make the box invisible so only dots show
    fig.add_trace(go.Box(
        y=residuals,
        boxpoints='all',      # Show all dots
        jitter=0,             # Don't scatter them left/right
        pointpos=0,           # Put them right in the center
        fillcolor='rgba(0,0,0,0)', # Invisible Box!
        line=dict(color='rgba(0,0,0,0)'), # Invisible Lines!
        marker=dict(color='green', size=10),
        name='Strip'
    ), row=1, col=2)

    # --- PART 5: PLOT 3 - BOX PLOT ---
    # Goal: See the summary stats (Median, Outliers)
    fig.add_trace(go.Box(
        y=residuals,
        marker=dict(color='orange'),
        name='Distribution'
    ), row=1, col=3)

    # --- PART 6: POLISH & SHOW ---
    fig.update_layout(
        title=f"Uncertainty Diagnostics: {category_name}",
        showlegend=False, # Hide legend to keep it clean
        template="plotly_white"
    )
    fig.show()

# Run it
create_diagnostic_plots('Infectious Diseases')
[Plot: "Uncertainty Diagnostics: Infectious Diseases" -- residual scatter plot, strip plot, and box plot]

Residual Scatter Plot (Left):

  • What it is: Plotting the error against the Year.
  • Why we need it: To check for Time Patterns.
  • Example: If the dots go Down-Down-Up-Up, your line is wrong (it should be a curve).
  • Ideal: Random bouncing around the red zero line.

Strip Plot (Middle):

  • What it is: All the errors plotted on a single vertical line.
  • Why we need it: To see the Density.
  • Example: Are the dots clumped together (predictable) or spread far apart (risky)?

Box Plot (Right):

  • What it is: A summary box showing the Median (middle line) and Range.
  • Why we need it: To find Outliers.
  • Example: If one dot is way outside the box whiskers, that year was a freak anomaly.

Now these results are obviously just for Infectious Diseases, and the process could be repeated to see the reliability of the data on each type of disease. However, with some help from Gemini ("Please help me interpret these plots"), I have noted my understanding of each one below. (Many of these are not so much understandings as they are further questions...)

  1. The residual scatter plot
  • If my model had been perfect at predicting deaths each year, the dots would all sit on the dashed red line -- here, however, we can see a mix of over- and underpredictions.
  • Possible implications: since it is a mix of over- and underpredictions, it could be positive in the sense that the model doesn't have a noticeable bias, but I think it mainly highlights a lack of sufficient data to draw any significant conclusions.
  2. The strip plot
  • Another way to look at the residuals, presented in a manner I personally find less appealing than the residual scatter plot, since residual values that are close together may be difficult to tell apart.
  • Possible implications: upon further reflection, my prior doubt is somewhat nullified, because an overall glance gives you a sense of the spread of the residuals plus any potential outliers.
  3. The box plot
  • Centre, spread, and skew: three words I haven't seen for a long time. My understanding is that the median being close to zero means the model is fairly reliable, despite the residuals. There is a slight negative skew, but it doesn't seem too significant. The fact that there are no significant outliers seems like a good start for the model, but again, the flaws in the data are a little too large to ignore.

To try my own hand at probability distributions without the LLM support, I used the following video as a reference to experiment with my data.

In [5]:
from IPython.display import YouTubeVideo
In [6]:
video_id = 'uial-2girHQ'
YouTubeVideo(id=video_id, width=900, height=400)
Out[6]:
[Embedded YouTube video]

After watching the whole video and trying out some of the suggested code, I still felt most comfortable with what I'd done previously with Gemini -- just keeping this one here as a note, especially since I don't normally use matplotlib and haven't yet used scipy.stats.

In [7]:
import matplotlib.pyplot as plt
import scipy.stats as stats
In [11]:
# df_grouped is already a DataFrame, so it can be plotted directly
# (this draws one density curve per year column)
df_grouped.plot(kind="density",
                figsize=(5,5),
                xlim=(-1,11));
[Plot: overlaid pandas density curves from df_grouped -- one curve per year column]

Fitting a probability distribution to my data

Having done a brief investigation of my data's probability distribution, I then asked Gemini the following question:

Now that I've investigated the probability distribution of my data, I would like to fit a probability distribution to my data.

  1. In simple terms, what does this mean?
  2. What would the best type of distribution be for my data, and why?

What I wanted to highlight from the Gemini response here was this:

Honorable Mention (For "Extra Credit"): Since you only have 5 data points, a Student’s t-distribution is theoretically more accurate. It looks like a Bell Curve but is "shorter and fatter," representing the fact that with only 5 years of history, extreme surprises are more likely. However, for most school projects, the Normal Distribution is the expected standard.

I had only ever worked with normal distributions before, so this was new to me -- and it seemed relevant given my 5 data points per disease.
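To see the "shorter and fatter" claim in actual numbers, here is a small check I added myself (not from Gemini): the probability of a surprise more than 3 standard deviations above the mean, under each distribution, with 4 degrees of freedom (5 points - 1):

from scipy.stats import norm, t

# Survival function sf(x) = P(X > x)
print(f"Normal:   {norm.sf(3):.4f}")     # about a 0.1% chance
print(f"t (df=4): {t.sf(3, df=4):.4f}")  # noticeably larger -- fatter tails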

I then put in this prompt:

Please

  1. Show me how to use my residuals to calculate the mean and standard deviation
  2. Using these data points, show me how to draw a t-distribution for my data

The following work is from that prompt.

In [17]:
# 1. Get your data
y_actual = df_grouped.loc['Infectious Diseases'].values
x_years = np.array([2018, 2019, 2020, 2021, 2022])

# 2. Calculate the Trend Line (Linear Model)
coef = np.polyfit(x_years, y_actual, 1)
predicted = np.poly1d(coef)(x_years)

# 3. Calculate Residuals (The Errors)
residuals = y_actual - predicted

# 4. Calculate the Stats
res_mean = np.mean(residuals)  # The Average Error
res_std = np.std(residuals, ddof=1) # The Spread (Standard Deviation)

print(f"Mean of Residuals: {res_mean:.2f}")
print(f"Standard Deviation: {res_std:.2f}")
Mean of Residuals: 0.00
Standard Deviation: 4.85

Note: I wanted to keep the above section of code just to understand how to calculate the mean and standard deviation of the residuals ... a mean of 0 seems pretty good here, but a standard deviation of 4.85 doesn't seem ideal given that my data figures are not all that high. Although trying to "predict" numbers of deaths from such a short time frame seems less than ideal anyway.
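One detail I wanted a note on for myself: the ddof=1 argument makes NumPy divide by N-1 instead of N (the "sample" standard deviation), which makes a visible difference with only 5 points. A quick check, reusing the residuals from the cell above:

print(np.std(residuals))          # divides by N   (population SD)
print(np.std(residuals, ddof=1))  # divides by N-1 (sample SD, used above)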

Also, with the code below, I managed to define and work on the plot_t_distribution function myself -- small wins!

In [26]:
import plotly.graph_objects as go
from scipy.stats import t
import numpy as np

# 1. DEFINING THE FUNCTION
def analyse_t_distribution(category_name):
    # Safety check, as always
    if category_name not in df_grouped.index:
        print("Error: Name not found")
        return

    # Getting my data
    y_actual = df_grouped.loc[category_name].values
    x_years = np.array([2018, 2019, 2020, 2021, 2022])

    # Fitting a Linear Model to get residuals
    coef = np.polyfit(x_years, y_actual, 1)
    predicted = np.poly1d(coef)(x_years)
    residuals = y_actual - predicted

    # Calculating the stats, as done 2 cells above
    res_mean = np.mean(residuals)
    res_std = np.std(residuals, ddof=1)
    
    print(f"--- Statistics for {category_name} ---")
    print(f"Mean: {res_mean:.2f}")
    print(f"Std Dev: {res_std:.2f}")

# 2. PLOTTING
def plot_t_distribution(category_name):
    if category_name not in df_grouped.index:
        print("Error: Name not found")
        return

    # Recomputing the residuals and stats for this category, so the plot
    # actually matches category_name instead of relying on leftover globals
    y_actual = df_grouped.loc[category_name].values
    x_years = np.array([2018, 2019, 2020, 2021, 2022])
    coef = np.polyfit(x_years, y_actual, 1)
    residuals = y_actual - np.poly1d(coef)(x_years)
    res_mean = np.mean(residuals)
    res_std = np.std(residuals, ddof=1)

    # Setting up the curve - drawing the curve from "Mean - 4 Sigmas" to "Mean + 4 Sigmas"
    x_curve = np.linspace(res_mean - 4*res_std, res_mean + 4*res_std, 100)
    
    # Calculating the Shape: df (Degrees of Freedom) = Number of Data Points - 1
    degrees_of_freedom = len(residuals) - 1 
    
    y_curve = t.pdf(x_curve, df=degrees_of_freedom, loc=res_mean, scale=res_std)
    
    # 3. Drawing the Plot
    fig = go.Figure()
    
    # A. The Histogram (My actual errors)
    fig.add_trace(go.Histogram(
        x=residuals, 
        histnorm='probability density', # Normalising height to match the curve
        name='Actual Residuals',
        marker_color='gray', opacity=0.5
    ))
    
    # B. The T-Distribution (The Math Model itself) -- still need to understand this!
    fig.add_trace(go.Scatter(
        x=x_curve, y=y_curve,
        mode='lines',
        name=f'T-Distribution (df={degrees_of_freedom})',
        line=dict(color='purple', width=3)
    ))
    
    # C. The Mean Line (Center)
    fig.add_trace(go.Scatter(
        x=[res_mean, res_mean], y=[0, max(y_curve)],
        mode='lines',
        line=dict(color='red', dash='dash'),
        name=f'Mean ({res_mean:.2f})'
    ))
    
    fig.update_layout(title=f"T-Distribution of Errors: {category_name}", template="plotly_white")
    fig.show()

# 3. CALL THE FUNCTIONS
analyse_t_distribution('Infectious Diseases')
plot_t_distribution('Infectious Diseases')
--- Statistics for Infectious Diseases ---
Mean: 0.00
Std Dev: 4.85
[Plot: "T-Distribution of Errors: Infectious Diseases" -- purple t-distribution curve over the gray residual histogram, with a red dashed mean line]

Density Estimation

Asking Gemini about this was tough, given that I don't even know how to talk about density estimation. My prompt was: "What is the difference between fitting a probability distribution to my data, and finding (?) the density estimation of my data?"

It provided a pretty long response, but I found this to be most helpful:

  • Fitting a Probability Distribution (Parametric): You assume the data has a specific, simple shape (like a Bell Curve), and you just calculate the size.
  • Density Estimation (Non-Parametric): You assume nothing. You let the data draw its own shape, no matter how weird or lumpy it looks.

I also noticed that, based on what I've read/seen so far, my density estimation is going to look even weirder given that I have only 5 data points per disease...
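Before the comparison code, here is my rough understanding of what gaussian_kde actually does, as a hand-rolled sketch on toy numbers (bandwidth picked arbitrarily): it puts a small Gaussian "bump" on every data point and averages them into one curve.

import numpy as np
from scipy.stats import norm, gaussian_kde

points = np.array([-4.0, -1.0, 0.0, 1.5, 3.5])  # toy residuals
x = np.linspace(-10, 10, 200)
bw = 1.5  # bandwidth = width of each bump (chosen by hand here)

# Hand-rolled KDE: the average of one Gaussian bump per data point
manual = np.mean([norm.pdf(x, loc=p, scale=bw) for p in points], axis=0)

# SciPy's version picks its own bandwidth, so the curves differ slightly
scipy_version = gaussian_kde(points)(x)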

In [27]:
import plotly.graph_objects as go
from scipy.stats import norm, gaussian_kde
import numpy as np

def compare_fit_vs_density(category_name):
    if category_name not in df_grouped.index:
        return

    # 1. GET RESIDUALS
    y_actual = df_grouped.loc[category_name].values
    x_years = np.array([2018, 2019, 2020, 2021, 2022])
    coef = np.polyfit(x_years, y_actual, 1)
    residuals = y_actual - np.poly1d(coef)(x_years)

    # 2. METHOD A: FITTING (Parametric)
    # Assuming it's a Bell Curve, calculating and using Mean/SD 
    mu, std = norm.fit(residuals)
    x_range = np.linspace(min(residuals)-10, max(residuals)+10, 100)
    y_fit = norm.pdf(x_range, mu, std)

    # 3. METHOD B: DENSITY ESTIMATION (Non-Parametric / KDE)
    # Just smooth out the data points into a curve
    kde = gaussian_kde(residuals)
    y_density = kde(x_range)

    # 4. PLOT
    fig = go.Figure()

    # The Histogram (The Raw Data)
    fig.add_trace(go.Histogram(
        x=residuals, histnorm='probability density',
        name='Data Histogram', marker_color='lightgray', opacity=0.5
    ))

    # The Fitted Line (Red)
    fig.add_trace(go.Scatter(
        x=x_range, y=y_fit, mode='lines',
        name='Fitted Distribution (Assumption)',
        line=dict(color='red', width=3, dash='dash')
    ))

    # The Density Line (Green)
    fig.add_trace(go.Scatter(
        x=x_range, y=y_density, mode='lines',
        name='Density Estimation (The Reality)',
        line=dict(color='green', width=3)
    ))

    fig.update_layout(title=f"Fitting vs. Density Estimation: {category_name}")
    fig.show()

compare_fit_vs_density('Infectious Diseases')
[Plot: "Fitting vs. Density Estimation: Infectious Diseases" -- red dashed fitted normal curve vs. green KDE curve over the residual histogram]

The produced plot made sense given the explanation of fitting vs. density estimation -- and the lumpier shape of the green density curve seems expected, given that only 5 data points were used to generate it...

Below were (and kind of still are) some remaining questions I have, which have been partly addressed by Gemini.

  1. What is gaussian KDE?

  2. Please provide further explanation for why it is better to plot the residuals, not the data points themselves, when looking at density estimation.

  3. Overall, all my probability calculations have looked at a single category_name at a time -- theoretically, how should my approach change if I wanted to compare across the types of diseases rather than focusing on just one?

I was most concerned about question 3, and am still trying to understand this. Gemini suggested using a violin plot, which is shown below, but I'm not convinced. Maybe I could categorise diseases into those with more or less uncertainty, or identify other ways to sort this data so that it tells a story... But for now, I'll leave this here (aside from one small stab at ranking diseases by uncertainty, after the residual table below).

Note:

I have started to break down the code below for my own understanding, but other than changing the height and a few small edits, I haven't fully understood it yet.

In [31]:
residual_data = []

for category in df_grouped.index:
    # 1. Get Data for this specific disease
    y_actual = df_grouped.loc[category].values
    x_years = np.array([2018, 2019, 2020, 2021, 2022])
    
    # 2. Fit Linear Model
    coef = np.polyfit(x_years, y_actual, 1)
    predicted_y = np.poly1d(coef)(x_years)
    
    # 3. Calculate Residuals
    residuals = y_actual - predicted_y
    
    # 4. Store them
    # We add 5 rows (one for each year) to our list
    for i in range(len(residuals)):
        residual_data.append({
            'Disease Type': category,
            'Year': x_years[i],
            'Residual': residuals[i]
        })

# Convert the list into a clean Pandas DataFrame
all_residuals_df = pd.DataFrame(residual_data)

# Show the first few rows to verify
print("Success! Calculated residuals for all diseases.")
print(all_residuals_df.head())
Success! Calculated residuals for all diseases.
                                        Disease Type  Year  Residual
0  Certain Conditions Originating in the Perinata...  2018       1.2
1  Certain Conditions Originating in the Perinata...  2019      -5.0
2  Certain Conditions Originating in the Perinata...  2020     -10.2
3  Certain Conditions Originating in the Perinata...  2021      30.6
4  Certain Conditions Originating in the Perinata...  2022     -16.6
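Before the violin plot, here is one small step toward my idea of sorting diseases by uncertainty (my own addition, not from Gemini): ranking each disease type by the standard deviation of its residuals, using the all_residuals_df table just built.

# Higher residual SD = the linear trend misses by more = more uncertainty
uncertainty_ranking = (
    all_residuals_df
    .groupby('Disease Type')['Residual']
    .std()
    .sort_values(ascending=False)
)
print(uncertainty_ranking)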
In [34]:
import plotly.express as px

# Create the Violin Plot
fig = px.violin(
    all_residuals_df, 
    y="Residual", 
    x="Disease Type", 
    box=True,       # Draws a box plot inside the violin
    points="all",   # Shows the actual dots
    hover_data=['Year'] # Lets you see which year had the big error
)

fig.update_layout(
    height=1200,  # Sets the height to 1200 pixels (standard is ~450)
    width=1000,  # Optional: Makes it wider too if the text is squashed
    title="Uncertainty Comparison: Residuals Across All Diseases",
    yaxis_title="Residual (Error in Number of Deaths)",
    template="plotly_white"
)

fig.show()
[Plot: "Uncertainty Comparison: Residuals Across All Diseases" -- violin plots of residuals per disease type]
In [35]:
fig = px.box(
    all_residuals_df, 
    x="Disease Type", 
    y="Residual",
    points="all", # Show the actual dots
    hover_data=['Year']
)

# Add a red line at 0 (The Target)
fig.add_hline(y=0, line_dash="dash", line_color="red", annotation_text="Perfect Prediction")

fig.update_layout(
    title="Uncertainty Analysis: Box Plot of Errors",
    yaxis_title="Error (Residual)",
    height=800, # Taller chart
    template="plotly_white"
)

fig.show()
[Plot: "Uncertainty Analysis: Box Plot of Errors" -- box plots of residuals per disease type, with a red dashed zero line]