< Home
Week 3: Assignment ~ Fitting a Function to the Data¶
In this assignment, I fit a linear regression line (line of best fit) to model the relationship between male and female disease case counts across age groups.
Linear Regression Line (line of best fit)¶
In [2]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# --- Load and reshape the disease table ------------------------------------
# Produces `scatter_data` (one row per Disease/AgeGroup with 'M' and 'F' case
# columns) and `filtered` (the curated diseases only) for the plotting code.

# Load dataset — the first 2 lines of the file are skipped as metadata
df = pd.read_csv("datasets/DataSet_CommonDiseases.csv", skiprows=2)

# Clean column names: the disease label, then a male/female pair per age band.
# Assigning df.columns raises if the file does not have exactly these 21 columns.
age_groups = [
    '0-29d', '1-11m', '1-4y', '5-9y', '10-14y',
    '15-19y', '20-24y', '25-49y', '50-59y', '60+y'
]
columns = ['Disease'] + [f'{age}_{sex}' for age in age_groups for sex in ('M', 'F')]
df.columns = columns
df['Disease'] = df['Disease'].astype(str).str.strip()

# Melt to long format: one row per (Disease, age/gender column)
df_melted = df.melt(
    id_vars=['Disease'],
    value_vars=columns[1:],
    var_name='AgeGender',
    value_name='Cases'
)

# Split e.g. '1-4y_M' into AgeGroup '1-4y' and Gender 'M' (split once, reuse)
age_gender_parts = df_melted['AgeGender'].str.split('_')
df_melted['AgeGroup'] = age_gender_parts.str[0]
df_melted['Gender'] = age_gender_parts.str[1]

# Convert Cases to numeric; anything unparseable becomes NaN and is dropped.
# NOTE(review): counts written with thousands separators would also be
# coerced to NaN here — confirm against the raw file.
df_melted['Cases'] = pd.to_numeric(df_melted['Cases'], errors='coerce')
df_melted = df_melted.dropna(subset=['Cases'])

# Pivot to get Male/Female side-by-side. Combinations that are absent after
# the NaN drop are filled with 0, so they enter later fits as zero cases.
scatter_data = df_melted.pivot_table(
    index=['Disease', 'AgeGroup'],
    columns='Gender',
    values='Cases',
    fill_value=0
).reset_index()

# Ensure both gender columns exist even if one had no numeric data at all
for col in ['M', 'F']:
    if col not in scatter_data.columns:
        scatter_data[col] = 0

# Curated subset of diseases to plot (hand-picked for clarity)
diseases_of_interest = [
    'Diarrhoea', 'Hypertension', 'Common Cold', 'DiabetesB',
    'TuberculosisB', 'Scabies', 'Asthma', 'Peptic Ulcer Syndrome'
]

# Surface curated names that do not occur in the data; isin() below would
# otherwise drop them without any indication.
known_diseases = set(scatter_data['Disease'])
missing_diseases = [name for name in diseases_of_interest if name not in known_diseases]
if missing_diseases:
    print(f"Warning: diseases of interest not found in dataset: {missing_diseases}")

# Filter data
filtered = scatter_data[scatter_data['Disease'].isin(diseases_of_interest)].copy()
# --- Male vs. female cases per disease, with a least-squares line each -----
# Reads `filtered` and `diseases_of_interest`; draws one scatter series and
# one dashed regression line per disease on a shared linear-scale Axes.
fig, ax = plt.subplots(figsize=(14, 9))
colors = plt.cm.tab10(np.linspace(0, 1, len(diseases_of_interest)))

for i, disease in enumerate(diseases_of_interest):
    sub = filtered[filtered['Disease'] == disease]
    x = sub['M'].to_numpy(dtype=float)
    y = sub['F'].to_numpy(dtype=float)

    # A line needs at least two points; diseases with fewer are left out
    if len(x) < 2:
        continue

    # Fit linear regression (female cases as a function of male cases)
    X = x.reshape(-1, 1)
    model = LinearRegression()
    model.fit(X, y)
    r2 = r2_score(y, model.predict(X))

    # Scatter
    ax.scatter(
        x, y,
        s=90, alpha=0.85,
        color=colors[i],
        edgecolor='k',
        linewidth=0.8,
        label=f'{disease} (R² = {r2:.2f})'
    )

    # Best-fit line (extend 10% beyond data range)
    x_range = np.ptp(x)
    x_line = np.linspace(x.min() - 0.1 * x_range, x.max() + 0.1 * x_range, 100)
    y_line = model.predict(x_line.reshape(-1, 1))
    ax.plot(x_line, y_line, color=colors[i], linestyle='--', linewidth=1.8)

    # Annotate age groups, nudged by 5% of each axis range. The offsets are
    # the same for every point of a disease, so compute them once here.
    dx = 0.5 * x_range / 10
    dy = 0.5 * np.ptp(y) / 10
    for age_group, male_cases, female_cases in zip(sub['AgeGroup'], x, y):
        ax.text(
            male_cases + dx,
            female_cases + dy,
            age_group,
            fontsize=7,
            color=colors[i],
            alpha=0.7
        )

ax.set_xlabel('Male Cases (per age group)', fontsize=13)
ax.set_ylabel('Female Cases (per age group)', fontsize=13)
ax.set_title('Scatter Plot of Male vs Female Cases by Disease & Age Group\n(with Linear Best-Fit Lines)', fontsize=15)
ax.legend(title='Disease', fontsize=10, title_fontsize=11, loc='upper left')
ax.grid(True, linestyle=':', alpha=0.6)
fig.tight_layout()
plt.show()
Log-Log Scale Scatter Plot¶
In [8]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# --- Load and reshape the disease table ------------------------------------
# This cell repeats the data preparation so it can run on its own. It yields
# `scatter_data` (one row per Disease/AgeGroup with 'M' and 'F' case columns)
# and `filtered` (the curated diseases only) for the plotting code.

# Load dataset — the first 2 lines of the file are skipped as metadata
df = pd.read_csv("datasets/DataSet_CommonDiseases.csv", skiprows=2)

# Clean column names: the disease label, then a male/female pair per age band.
# Assigning df.columns raises if the file does not have exactly these 21 columns.
age_groups = [
    '0-29d', '1-11m', '1-4y', '5-9y', '10-14y',
    '15-19y', '20-24y', '25-49y', '50-59y', '60+y'
]
columns = ['Disease'] + [f'{age}_{sex}' for age in age_groups for sex in ('M', 'F')]
df.columns = columns
df['Disease'] = df['Disease'].astype(str).str.strip()

# Melt to long format: one row per (Disease, age/gender column)
df_melted = df.melt(
    id_vars=['Disease'],
    value_vars=columns[1:],
    var_name='AgeGender',
    value_name='Cases'
)

# Split e.g. '1-4y_M' into AgeGroup '1-4y' and Gender 'M' (split once, reuse)
age_gender_parts = df_melted['AgeGender'].str.split('_')
df_melted['AgeGroup'] = age_gender_parts.str[0]
df_melted['Gender'] = age_gender_parts.str[1]

# Convert Cases to numeric; anything unparseable becomes NaN and is dropped.
# NOTE(review): counts written with thousands separators would also be
# coerced to NaN here — confirm against the raw file.
df_melted['Cases'] = pd.to_numeric(df_melted['Cases'], errors='coerce')
df_melted = df_melted.dropna(subset=['Cases'])

# Pivot to get Male/Female side-by-side. Combinations that are absent after
# the NaN drop are filled with 0; zero counts cannot be shown on a log axis.
scatter_data = df_melted.pivot_table(
    index=['Disease', 'AgeGroup'],
    columns='Gender',
    values='Cases',
    fill_value=0
).reset_index()

# Ensure both gender columns exist even if one had no numeric data at all
for col in ['M', 'F']:
    if col not in scatter_data.columns:
        scatter_data[col] = 0

# Curated subset of diseases to plot (hand-picked for clarity)
diseases_of_interest = [
    'Diarrhoea', 'Hypertension', 'Common Cold', 'DiabetesB',
    'TuberculosisB', 'Scabies', 'Asthma', 'Peptic Ulcer Syndrome'
]

# Surface curated names that do not occur in the data; isin() below would
# otherwise drop them without any indication.
known_diseases = set(scatter_data['Disease'])
missing_diseases = [name for name in diseases_of_interest if name not in known_diseases]
if missing_diseases:
    print(f"Warning: diseases of interest not found in dataset: {missing_diseases}")

# Filter data
filtered = scatter_data[scatter_data['Disease'].isin(diseases_of_interest)].copy()
# --- Male vs. female cases per disease on log-log axes ---------------------
# Reads `filtered` and `diseases_of_interest`. The regression is an ordinary
# linear fit on the raw counts (not on their logarithms), so each fitted line
# generally appears curved on these axes.
fig, ax = plt.subplots(figsize=(14, 9))
ax.set_xscale('log')
ax.set_yscale('log')
colors = plt.cm.tab10(np.linspace(0, 1, len(diseases_of_interest)))

for i, disease in enumerate(diseases_of_interest):
    sub = filtered[filtered['Disease'] == disease]
    x = sub['M'].to_numpy(dtype=float)
    y = sub['F'].to_numpy(dtype=float)

    # A line needs at least two points; diseases with fewer are left out
    if len(x) < 2:
        continue

    # Fit linear regression on all points, including zero counts
    X = x.reshape(-1, 1)
    model = LinearRegression()
    model.fit(X, y)
    r2 = r2_score(y, model.predict(X))

    # A log axis has no position for zero or negative values, so only
    # strictly positive (male, female) pairs are drawn and labelled.
    drawable = (x > 0) & (y > 0)

    # Scatter
    ax.scatter(
        x[drawable], y[drawable],
        s=90, alpha=0.85,
        color=colors[i],
        edgecolor='k',
        linewidth=0.8,
        label=f'{disease} (R² = {r2:.2f})'
    )

    # Best-fit line: extend 10% beyond the data range where that stays
    # positive, otherwise start at the smallest positive x.
    positive_x = x[x > 0]
    if positive_x.size:
        x_range = np.ptp(x)
        x_low = x.min() - 0.1 * x_range
        if x_low <= 0:
            x_low = positive_x.min()
        x_high = x.max() + 0.1 * x_range
        # Geometric spacing gives evenly spread samples along a log axis
        x_line = np.geomspace(x_low, x_high, 100)
        y_line = model.predict(x_line.reshape(-1, 1))
        # Blank out non-positive predictions so the line breaks there
        # instead of being drawn at an invalid log position.
        y_line = np.where(y_line > 0, y_line, np.nan)
        ax.plot(x_line, y_line, color=colors[i], linestyle='--', linewidth=1.8)

    # Annotate age groups with a fixed screen-space offset; an additive
    # data-unit offset would push labels far from small-valued points on
    # a log axis.
    for age_group, male_cases, female_cases in zip(sub['AgeGroup'], x, y):
        if male_cases > 0 and female_cases > 0:
            ax.annotate(
                age_group,
                xy=(male_cases, female_cases),
                xytext=(4, 4),
                textcoords='offset points',
                fontsize=7,
                color=colors[i],
                alpha=0.7
            )

ax.set_xlabel('Male Cases (per age group)', fontsize=13)
ax.set_ylabel('Female Cases (per age group)', fontsize=13)
ax.set_title('Scatter Plot of Male vs Female Cases by Disease & Age Group\n(with Linear Best-Fit Lines, Log-Log Scale)', fontsize=15)
ax.legend(title='Disease', fontsize=10, title_fontsize=11, loc='upper left')
ax.grid(True, linestyle=':', alpha=0.6)
# Lay out only after scales, labels and title are final
fig.tight_layout()
plt.show()
In [ ]: