import pandas as pd

df = pd.read_csv("datasets/Mortality cases.csv")
df.head(5)

df.columns

Index(['Table 2.8: Mortality cases by diseases for last 5 years, Bhutan',
       'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5',
       'Unnamed: 6'],
      dtype='object')

file_path = "datasets/Mortality cases.csv"

df_raw = pd.read_csv(file_path, header=None)   # load without assuming a header
df_raw.head(5)

header_row = 2 # Choosing this header as it contains the years ... not sure if the name of the disease and the ICD10 Code will be accounted for? 

df = pd.read_csv(file_path, header=header_row)

df.head()

df = pd.read_csv("datasets/Mortality cases3.csv")
df.head(5)

file_path = "datasets/Mortality cases3.csv"

df_raw = pd.read_csv(file_path)  
df_raw.head(10)

df.columns

Index(['ICD10 CODE', 'Name of the Disease', 'Type of disease', '2018', '2019',
       '2020', '2021', '2022'],
      dtype='object')

years = ['2018', '2019', '2020', '2021', '2022']
df[years] = df[years].apply(pd.to_numeric, errors='coerce') # pd.to_numeric ensures that data is read as numbers

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[65], line 2
      1 years = ['2018', '2019', '2020', '2021', '2022']
----> 2 df[years] = df[years].apply(pd.to_numeric, errors='coerce') # pd.to_numeric ensures that data is read as numbers

File /opt/conda/lib/python3.13/site-packages/pandas/core/frame.py:4119, in DataFrame.__getitem__(self, key)
   4117     if is_iterator(key):
   4118         key = list(key)
-> 4119     indexer = self.columns._get_indexer_strict(key, "columns")[1]
   4121 # take() does not accept boolean indexers
   4122 if getattr(indexer, "dtype", None) == bool:

File /opt/conda/lib/python3.13/site-packages/pandas/core/indexes/base.py:6212, in Index._get_indexer_strict(self, key, axis_name)
   6209 else:
   6210     keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
-> 6212 self._raise_if_missing(keyarr, indexer, axis_name)
   6214 keyarr = self.take(indexer)
   6215 if isinstance(key, Index):
   6216     # GH 42790 - Preserve name from an Index

File /opt/conda/lib/python3.13/site-packages/pandas/core/indexes/base.py:6261, in Index._raise_if_missing(self, key, indexer, axis_name)
   6259 if nmissing:
   6260     if nmissing == len(indexer):
-> 6261         raise KeyError(f"None of [{key}] are in the [{axis_name}]")
   6263     not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
   6264     raise KeyError(f"{not_found} not in index")

KeyError: "None of [Index(['2018', '2019', '2020', '2021', '2022'], dtype='object')] are in the [columns]"

df.columns

Index(['ICD10 CODE', 'Name of the Disease', 'Year', 'Deaths'], dtype='object')

import plotly.express as px

df_long = df.melt(
    id_vars=['ICD10 CODE', 'Name of the Disease', 'Type of disease'],
    value_vars=years,
    var_name='Year',
    value_name='DeathCount'
)

df_long.head()

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[63], line 1
----> 1 df_long = df.melt(
      2     id_vars=['ICD10 CODE', 'Name of the Disease', 'Type of disease'],
      3     value_vars=years,
      4     var_name='Year',
      5     value_name='DeathCount'
      6 )
      8 df_long.head()

File /opt/conda/lib/python3.13/site-packages/pandas/core/frame.py:9969, in DataFrame.melt(self, id_vars, value_vars, var_name, value_name, col_level, ignore_index)
   9959 @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"})
   9960 def melt(
   9961     self,
   (...)   9967     ignore_index: bool = True,
   9968 ) -> DataFrame:
-> 9969     return melt(
   9970         self,
   9971         id_vars=id_vars,
   9972         value_vars=value_vars,
   9973         var_name=var_name,
   9974         value_name=value_name,
   9975         col_level=col_level,
   9976         ignore_index=ignore_index,
   9977     ).__finalize__(self, method="melt")

File /opt/conda/lib/python3.13/site-packages/pandas/core/reshape/melt.py:74, in melt(frame, id_vars, value_vars, var_name, value_name, col_level, ignore_index)
     70 if missing.any():
     71     missing_labels = [
     72         lab for lab, not_found in zip(labels, missing) if not_found
     73     ]
---> 74     raise KeyError(
     75         "The following id_vars or value_vars are not present in "
     76         f"the DataFrame: {missing_labels}"
     77     )
     78 if value_vars_was_not_none:
     79     frame = frame.iloc[:, algos.unique(idx)]

KeyError: "The following id_vars or value_vars are not present in the DataFrame: ['Type of disease', '2018', '2019', '2020', '2021', '2022']"

print(df.columns.tolist())

['ICD10 CODE', 'Name of the Disease', 'Year', 'Deaths']

df.columns = df.columns.str.strip()

df = df_long
fig = px.scatter(df_long, x="Year", y="Deaths", color="Name of the Disease")
fig.show()

df_long.head()

fig = px.scatter(
    df_long,
    x="Year",
    y="Deaths",
    color="Name of the Disease",
    hover_name="Name of the Disease",
    title="Mortality Cases by Disease (Bhutan, 2018–2022)"
)

fig.show()

	Table 2.8: Mortality cases by diseases for last 5 years, Bhutan	Unnamed: 1	Unnamed: 2	Unnamed: 3	Unnamed: 4	Unnamed: 5	Unnamed: 6
0	ICD10 CODE	Name of the Disease	Year	NaN	NaN	NaN	NaN
1	NaN	NaN	2018	2019.0	2020.0	2021.0	2022.0
2	Infectious Diseases	NaN	NaN	NaN	NaN	NaN	NaN
3	A02ᴳ	Diarrhoea	6	6.0	2.0	NaN	2.0
4	A03ᴳ	Dysentery	NaN	NaN	NaN	NaN	NaN

	0	1	2	3	4	5	6
0	Table 2.8: Mortality cases by diseases for las...	NaN	NaN	NaN	NaN	NaN	NaN
1	ICD10 CODE	Name of the Disease	Year	NaN	NaN	NaN	NaN
2	NaN	NaN	2018	2019.0	2020.0	2021.0	2022.0
3	Infectious Diseases	NaN	NaN	NaN	NaN	NaN	NaN
4	A02ᴳ	Diarrhoea	6	6.0	2.0	NaN	2.0

	Unnamed: 0	Unnamed: 1	2018	2019	2020	2021	2022
0	Infectious Diseases	NaN	NaN	NaN	NaN	NaN	NaN
1	A02ᴳ	Diarrhoea	6.0	6.0	2.0	NaN	2.0
2	A03ᴳ	Dysentery	NaN	NaN	NaN	NaN	NaN
3	A15ᴳ	Tuberculosis	22.0	20.0	20.0	31.0	17.0
4	A41ᴳ	Other Sepsis, including Septicaemia	62.0	46.0	52.0	32.0	45.0

	ICD10 CODE	Name of the Disease	Type of disease	2018	2019	2020	2021	2022
0	A02ᴳ	Diarrhoea	Infectious	6.0	6.0	2.0	NaN	2.0
1	A03ᴳ	Dysentery	Infectious	NaN	NaN	NaN	NaN	NaN
2	A15ᴳ	Tuberculosis	Infectious	22.0	20.0	20.0	31.0	17.0
3	A41ᴳ	Other Sepsis, including Septicaemia	Infectious	62.0	46.0	52.0	32.0	45.0
4	A50	Congenital Syphilis	Infectious	NaN	NaN	NaN	NaN	NaN

	ICD10 CODE	Name of the Disease	Type of disease	2018	2019	2020	2021	2022
0	A02ᴳ	Diarrhoea	Infectious	6.0	6.0	2.0	NaN	2.0
1	A03ᴳ	Dysentery	Infectious	NaN	NaN	NaN	NaN	NaN
2	A15ᴳ	Tuberculosis	Infectious	22.0	20.0	20.0	31.0	17.0
3	A41ᴳ	Other Sepsis, including Septicaemia	Infectious	62.0	46.0	52.0	32.0	45.0
4	A50	Congenital Syphilis	Infectious	NaN	NaN	NaN	NaN	NaN
5	A51ᴳ	Genital Ulcer	Infectious	NaN	NaN	NaN	NaN	NaN
6	A53ᴳ	Syphilis	Infectious	NaN	NaN	NaN	NaN	NaN
7	A54ᴳ	Urethral/Vaginal Discharges	Infectious	NaN	NaN	NaN	NaN	NaN
8	A63ᴳ	Other STD excluding HIV/AIDS	Infectious	NaN	1.0	NaN	1.0	1.0
9	A77ᴳ	Rickettsial Disease	Infectious	2.0	2.0	2.0	1.0	2.0

A previous draft that I'm keeping for notes ... but it is full of errors and references to different data sets so I moved to a completely new notebook¶

Obtaining the data¶

Clearing up the data¶

Data Analysis¶