import pandas as pd
import numpy as np

# Read the common-diseases table and clean it in one chained expression.
# header=1 makes the second line of the file the header row, so the title
# line above it is skipped.
df = (
    pd.read_csv("datasets/DataSet_CommonDiseases.csv", header=1)
    .dropna(how='all')  # discard rows in which every cell is empty
    .fillna(0)          # any remaining blank cell becomes 0
)
df.head(3)  # notebook preview of the first 3 rows

# Select the row(s) whose first column holds the disease name 'Diarrhoea'.
diarrhoea_row = df[df.iloc[:, 0] == 'Diarrhoea']

# Extract just the numbers (every column after the disease name) as one
# flat integer array.
counts = diarrhoea_row.iloc[:, 1:].values.flatten().astype(int)

# Age groups & sex labels
age_groups = ['0-29 Days', '1-11 Months', '1-4 Years', '5-9 Years',
              '10-14 Years', '15-19 Years', '20-24 Years',
              '25-49 Years', '50-59 Years', '60+ Years']
# One M label and one F label per age group, alternating: M, F, M, F, ...
sexes = ['M', 'F'] * len(age_groups)

# Fail with a readable message when the row is missing or has an unexpected
# number of value columns (pandas would otherwise raise a generic
# "arrays must be of the same length" ValueError below).
if len(counts) != len(sexes):
    raise ValueError(
        f"Expected {len(sexes)} Diarrhoea counts (M and F for each of "
        f"{len(age_groups)} age groups), found {len(counts)}"
    )

# Make tidy table
# Because `sexes` alternates M, F, each age group has to appear twice in a
# row so that every age gets one M row and one F row. The earlier
# `age_groups * 2` cycled through all ages twice instead, which gave each age
# group two rows of the same sex and attached counts to the wrong ages.
# NOTE(review): assumes the CSV value columns run M, F for each age group in
# turn -- confirm against the file's header row.
simple_df = pd.DataFrame({
    'Age': [age for age in age_groups for _ in ('M', 'F')],
    'Sex': sexes,
    'Count': counts
})
simple_df

import matplotlib.pyplot as plt

# Histogram of the per-group case counts: 10 bins over the 'Count' column,
# drawn through the Axes object rather than the implicit pyplot state.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(simple_df['Count'], bins=10, color='skyblue', edgecolor='black')
ax.set_title("How common are different case counts for Diarrhoea?")
ax.set_xlabel("Number of cases")
ax.set_ylabel("How many age/sex groups have that count")
plt.show()

# Summary statistics over all age/sex rows of the tidy table.
case_counts = simple_df['Count']
mean_val = case_counts.mean()
std_val = case_counts.std()

# One standard deviation either side of the mean, reported as the
# "typical range".
lower_bound = mean_val - std_val
upper_bound = mean_val + std_val

print(f"Average cases per age/sex group: {mean_val:.1f}")
print(f"Standard deviation (how 'spread out' it is): {std_val:.1f}")
print(f"So typical range: {lower_bound:.0f} to {upper_bound:.0f}")

Average cases per age/sex group: 1454.2
Standard deviation (how 'spread out' it is): 902.1
So typical range: 552 to 2356

# Average count per sex: build a boolean mask for each label, then take the
# mean of the 'Count' values that the mask selects.
is_male = simple_df['Sex'] == 'M'
is_female = simple_df['Sex'] == 'F'

male_avg = simple_df.loc[is_male, 'Count'].mean()
female_avg = simple_df.loc[is_female, 'Count'].mean()

print(f"Male avg: {male_avg:.1f}")
print(f"Female avg: {female_avg:.1f}")
# The '+' format flag always prints the sign, so the direction is explicit.
print(f"Difference: {male_avg - female_avg:+.1f} (positive = more in males)")

Male avg: 1508.5
Female avg: 1400.0
Difference: +108.5 (positive = more in males)

# Group by age (sum of all rows sharing an 'Age' label).
# sort=False keeps the groups in the order they first appear in simple_df.
# The default sorts the labels as strings, which puts '10-14 Years' ahead of
# '5-9 Years' and scrambles the age axis of the chart.
age_totals = simple_df.groupby('Age', sort=False)['Count'].sum()

# Bar chart with one bar per age group.
plt.figure(figsize=(8,4))
age_totals.plot(kind='bar', color='teal')
plt.title("Diarrhoea: Total cases by age group")
plt.ylabel("Total cases (M + F)")
plt.xticks(rotation=45)  # slant the long age labels so they do not overlap
plt.tight_layout()
plt.show()

Week 5: Assignment ~ Exploring Probability Distributions

Step 1: Loading and Cleaning the Data

Step 2: Making a histogram

Step 3: Computing Basic Stats (Mean, Std Dev)

Step 4: Compare Male vs Female

Step 5: Displaying the data age-wise

	Age	Sex	Count
0	0-29 Days	M	68
1	1-11 Months	F	66
2	1-4 Years	M	928
3	5-9 Years	F	906
4	10-14 Years	M	3303
5	15-19 Years	F	2800
6	20-24 Years	M	2007
7	25-49 Years	F	1753
8	50-59 Years	M	1717
9	60+ Years	F	1378
10	0-29 Days	M	1230
11	1-11 Months	F	997
12	1-4 Years	M	991
13	5-9 Years	F	920
14	10-14 Years	M	2722
15	15-19 Years	F	2928
16	20-24 Years	M	801
17	25-49 Years	F	865
18	50-59 Years	M	1318
19	60+ Years	F	1387