from sklearn.decomposition import PCA
import pandas as pd
import plotly.express as px

# Load the raw mortality table from disk.
df = pd.read_csv("datasets/Mortality cases3.csv")

# Names of the per-year columns that get summed below.
years = [str(year) for year in range(2018, 2023)]

# Tidy the disease labels in one pass: trim surrounding whitespace, then
# rename entries that are exactly 'Infectious' to 'Infectious Diseases'.
df['Type of disease'] = (
    df['Type of disease']
    .str.strip()
    .replace('Infectious', 'Infectious Diseases')
)

# One row per disease type, one summed column per year.
df_grouped = df.groupby('Type of disease')[years].sum()

# 1. RE-SHAPING THE DATA
# PCA treats each row as one observation and each column as one feature.
# df_grouped is already laid out as rows = diseases, columns = years, so we
# simply work on a copy of it.
df_pivot = df_grouped.copy()

# 2. RUNNING PCA
# Compress the year columns into 2 "summary dimensions".
# NOTE(review): the counts are passed to PCA as-is (no scaling step here) --
# confirm that is what the analysis intends.
pca = PCA(n_components=2)
components = pca.fit_transform(df_pivot)

# 3. CREATING A DATAFRAME FOR PLOTTING
# One column per principal component, plus the disease label (taken from the
# index of df_pivot) so every point can be annotated.
pca_df = pd.DataFrame(components, columns=['PC1', 'PC2']).assign(
    **{'Disease Name': df_pivot.index}
)

# 4. PLOT
# Labelled 2D scatter: each point is one disease placed at its (PC1, PC2).
scatter_options = {
    'x': 'PC1',
    'y': 'PC2',
    'text': 'Disease Name',
    'title': "PCA: Which diseases behave similarly?",
    'template': "plotly_white",
}
fig = px.scatter(pca_df, **scatter_options)
# Put each label above its marker instead of on top of it.
fig.update_traces(textposition='top center')
fig.show()

# out of curiosity, I tried the same code again but with 3 components - ideally, this would create a 3D plot
from sklearn.decomposition import PCA
import pandas as pd
import plotly.express as px

# 1. RE-SHAPING THE DATA
# PCA treats each row as one observation and each column as one feature.
# df_grouped is already laid out as rows = diseases, columns = years, so we
# simply work on a copy of it.
df_pivot = df_grouped.copy()

# 2. RUNNING PCA
# Same as the 2D version, but keeping three components this time.
pca = PCA(n_components=3)
components = pca.fit_transform(df_pivot)

# 3. CREATING A DATAFRAME FOR PLOTTING
# One column per principal component, plus the disease label (taken from the
# index of df_pivot) so every point can be annotated.
pca_df = pd.DataFrame(components, columns=['PC1', 'PC2', 'PC3']).assign(
    **{'Disease Name': df_pivot.index}
)

# 4. PLOT
# A flat scatter plot cannot show a third component, so this uses plotly's
# dedicated 3D scatter function instead.
scatter_options = {
    'x': 'PC1',
    'y': 'PC2',
    'z': 'PC3',
    'text': 'Disease Name',
    'title': "PCA: Which diseases behave similarly?",
    'template': "plotly_white",
}
fig = px.scatter_3d(pca_df, **scatter_options)
# Put each label above its marker instead of on top of it.
fig.update_traces(textposition='top center')

# Layout tweaks reused from an earlier figure: a taller canvas and
# descriptive titles for the three axes of the 3D scene.
axis_titles = {
    'xaxis_title': 'PC1 (Main Trend)',
    'yaxis_title': 'PC2 (Secondary Pattern)',
    'zaxis_title': 'PC3 (Nuance)',
}
fig.update_layout(height=800, template="plotly_white", scene=axis_titles)

fig.show()

import plotly.express as px
from scipy import stats
import pandas as pd
import numpy as np

# 1. CALCULATE YOUR CUSTOM COMPONENTS
# For every disease, fit a straight line through its yearly totals and keep:
#   - the slope of that line (direction and steepness of the trend),
#   - the standard error of the slope (how poorly a straight line pins the
#     trend down -- used here as an "unpredictability" score),
#   - the mean of the yearly totals (used as bubble size).

# Slopes whose magnitude does not exceed this value are treated as flat.
# NOTE(review): this cut-off is absolute, so it does not account for how large
# a disease's counts are -- confirm that is intended.
TREND_THRESHOLD = 1


def _classify_trend(slope, threshold=TREND_THRESHOLD):
    """Return the colour-coding label ("Rising", "Falling" or "Flat/Stable") for a slope."""
    if slope > threshold:
        return "Rising"
    if slope < -threshold:
        return "Falling"
    return "Flat/Stable"


# Every disease shares the same time axis (0, 1, 2, ... one step per year
# column), so build it once instead of once per disease.
x = np.arange(df_grouped.shape[1])

custom_data = []

for disease in df_grouped.index:
    y = df_grouped.loc[disease].values

    # Only the slope and its standard error are needed, so read them by name
    # from the regression result rather than unpacking unused values.
    fit = stats.linregress(x, y)
    slope, std_err = fit.slope, fit.stderr

    # Classify the trend for color-coding
    trend_label = _classify_trend(slope)

    custom_data.append({
        'Disease Name': disease,
        'Trend Score (Slope)': slope,        # X-axis of the plot
        'Unpredictability (Error)': std_err, # Y-axis of the plot
        'Category': trend_label,             # Color of bubble
        'Average Deaths': np.mean(y),        # Size of bubble
    })

df_custom = pd.DataFrame(custom_data)

# 2. PLOT THE RESULT
# X-Axis = Trend (Left is Falling, Right is Rising)
# Y-Axis = Unpredictability (Top is Chaos, Bottom is Stable)
fig = px.scatter(
    df_custom,
    x="Trend Score (Slope)",
    y="Unpredictability (Error)",
    color="Category",           # Color by Rising/Falling
    size="Average Deaths",      # Size by Magnitude
    text="Disease Name",
    title="Disease Classification: Rising, Falling, vs. Unpredictable",
    color_discrete_map={"Rising": "red", "Falling": "green", "Flat/Stable": "gray"}
)

# Add crosshairs to divide the quadrants
fig.add_vline(x=0, line_dash="dash", line_color="gray") # Zero Trend Line
fig.add_hline(y=df_custom['Unpredictability (Error)'].mean(), line_dash="dash", line_color="gray") # Average Noise Line

fig.update_traces(textposition='top center')
fig.update_layout(template="plotly_white", height=600)
fig.show()

# Transforms

# Understanding PCA

# Data analysis using PCA

# Extra: Non-PCA analysis