Blair Evans - Fab Futures - Data Science

Class 4: Machine Learning¶

Assignment¶

  • Fit a machine learning model to your data

Load Dataset¶

In [1]:
# Using NumPy, Pandas dataframes, Matplotlib, Seaborn, and SciPy's curve_fit

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.optimize import curve_fit

This week I will use the Global Health, Nutrition, Mortality, Economic Data dataset from Kaggle. The CSV file for the dataset is named UnifiedDataset.csv.

In [2]:
gh_df = pd.read_csv("datasets/UnifiedDataset.csv")
In [3]:
gh_df.head() # quick check of data import and structure
Out[3]:
Country Year Gender Life Expectancy Infant Mortality Rate Low CI Value Infant Mortality Rate High CI Value Infant Mortality Rate Under 5 Mortality Rate Low CI Value Under 5 Mortality Rate High CI Value Under 5 Mortality Rate ... Cereal Consumption Rye Cereal Consumption Barley Cereal Consumption Sorghum Cereal Consumption Maize Cereal Consumption Wheat Cereal Consumption Rice Diet Calories Animal Protein Diet Calories Plant Protein Diet Calories Fat Diet Calories Carbohydrates
0 Afghanistan 1990 Both sexes 50.331 120.4 111.2 130.9 177.7 162.5 194.3 ... NaN 103.0 NaN 201.0 1195.0 174.0 67.80 197.08 435.60 1613.52
1 Afghanistan 1990 Female 51.442 114.2 105.1 124.7 173.1 158.0 189.7 ... NaN 103.0 NaN 201.0 1195.0 174.0 67.80 197.08 435.60 1613.52
2 Afghanistan 1990 Male 49.281 126.2 116.4 137.5 182.0 166.6 199.3 ... NaN 103.0 NaN 201.0 1195.0 174.0 67.80 197.08 435.60 1613.52
3 Afghanistan 1991 Both sexes 50.999 116.8 108.2 126.2 171.7 157.6 186.9 ... NaN 94.0 NaN 164.0 1043.0 159.0 64.96 173.68 370.08 1435.28
4 Afghanistan 1991 Female 52.119 110.7 102.1 120.4 167.1 153.0 182.6 ... NaN 94.0 NaN 164.0 1043.0 159.0 64.96 173.68 370.08 1435.28

5 rows × 150 columns
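
head() only shows a slice of the 150 columns. To see the overall shape and the full column list (standard pandas attributes, not an original cell in this notebook):

# Overall size and the full list of column names
print(gh_df.shape)
print(gh_df.columns.tolist())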

In [4]:
# Display basic info about missing values
print('Missing values per column:')
print(gh_df.isnull().sum())
Missing values per column:
Country                            0
Year                               0
Gender                             0
Life Expectancy                    0
Infant Mortality Rate           7593
                                ... 
Cereal Consumption Rice         9084
Diet Calories Animal Protein    9972
Diet Calories Plant Protein     9972
Diet Calories Fat               9972
Diet Calories Carbohydrates     9972
Length: 150, dtype: int64
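
Many columns are missing thousands of values. A minimal sketch of ranking columns by their missing fraction (the 50% cutoff is only an illustration, not something used later in this notebook):

# Fraction of missing values per column, largest first
missing_frac = gh_df.isnull().mean().sort_values(ascending=False)
print(missing_frac.head(10))

# Columns that are more than half empty (0.5 is an illustrative cutoff)
sparse_cols = missing_frac[missing_frac > 0.5].index.tolist()
print(f"{len(sparse_cols)} of {gh_df.shape[1]} columns are more than 50% missing")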

Visualize Dataset¶

In [5]:
# Histogram of Life Expectancy
plt.figure(figsize=(8, 5))
sns.histplot(gh_df['Life Expectancy'], kde=False, color='lightcoral')
plt.title('Distribution of Life Expectancy')
plt.xlabel('Life Expectancy')
plt.ylabel('Count')
plt.tight_layout()
plt.show()
[Figure: histogram of the distribution of Life Expectancy]
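
Seaborn can split the same distribution on another column. As a rough sketch (the Gender categories are the ones visible in the head() output above), the histogram can be broken out by Gender:

# Overlaid Life Expectancy histograms, one per Gender category
plt.figure(figsize=(8, 5))
sns.histplot(data=gh_df, x='Life Expectancy', hue='Gender', kde=False)
plt.title('Life Expectancy by Gender')
plt.xlabel('Life Expectancy')
plt.ylabel('Count')
plt.tight_layout()
plt.show()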

Function Fit¶

Look at Life Expectancy as a function of Neonatal Mortality Rate, two columns from the dataset. Start with a scatter plot of the raw data, then overlay linear (affine) and quadratic fits.

In [51]:
# Drop rows with missing Neonatal Mortality Rate
gh_df_clean = gh_df.dropna(subset=['Neonatal Mortality Rate']).copy()

# Affine (linear) model
def my_model_function(x, a, b):
    return a * x + b

popt, pcov = curve_fit(my_model_function,
                       gh_df_clean['Neonatal Mortality Rate'],
                       gh_df_clean['Life Expectancy'])
a_fit, b_fit = popt
print(f"Optimal affine parameters: a={a_fit}, b={b_fit}")

# Quadratic model
def my_model_function2(x, a, b, c):
    return a * x**2 + b * x + c

popt2, pcov2 = curve_fit(my_model_function2,
                         gh_df_clean['Neonatal Mortality Rate'],
                         gh_df_clean['Life Expectancy'])
a_fit2, b_fit2, c_fit2 = popt2
print(f"Optimal quadratic parameters: a={a_fit2}, b={b_fit2}, c={c_fit2}")

# Sorted unique x values for plotting smooth fit curves
gh_unique = np.sort(gh_df_clean['Neonatal Mortality Rate'].unique())

fig, ax = plt.subplots(figsize=(10, 10))

plt.scatter(gh_df_clean['Neonatal Mortality Rate'], gh_df_clean['Life Expectancy'],
            color='grey', label='Original Data')
plt.plot(gh_unique, my_model_function(gh_unique, a_fit, b_fit),
         color='blue', label='Affine Fit')
plt.plot(gh_unique, my_model_function2(gh_unique, a_fit2, b_fit2, c_fit2),
         color='orange', label='Quadratic Fit')

plt.xlabel('Neonatal Mortality Rate')
plt.ylabel('Life Expectancy')
plt.title('Affine and Quadratic Fits to DataFrame Columns')
plt.legend()
plt.show()
Optimal affine parameters: a=-0.5878962024919596, b=78.86242368593777
Optimal quadratic parameters: a=0.005208015496421006, b=-0.8446766683386308, c=80.71550146814968
[Figure: scatter of Life Expectancy vs. Neonatal Mortality Rate with the affine (blue) and quadratic (orange) fit curves]
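
curve_fit returns parameters but not a goodness-of-fit score. To compare these fits with the machine learning model's score below, an R² can be computed by hand; a minimal sketch using the fitted parameters above:

# R^2 for the affine and quadratic fits, comparable to MLPRegressor.score()
x_obs = gh_df_clean['Neonatal Mortality Rate']
y_obs = gh_df_clean['Life Expectancy']
ss_tot = np.sum((y_obs - y_obs.mean()) ** 2)

for name, y_hat in [('affine', my_model_function(x_obs, a_fit, b_fit)),
                    ('quadratic', my_model_function2(x_obs, a_fit2, b_fit2, c_fit2))]:
    ss_res = np.sum((y_obs - y_hat) ** 2)
    print(f"{name} R^2: {1 - ss_res / ss_tot}")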

Fit a machine learning model¶

In [53]:
from sklearn.neural_network import MLPRegressor

X = gh_df_clean['Neonatal Mortality Rate'].to_numpy().reshape(-1, 1)
y = gh_df_clean['Life Expectancy'].to_numpy()

# Single-feature neural-network regression
mlpregress = MLPRegressor(solver='sgd', hidden_layer_sizes=(100,),
                          activation='tanh', random_state=1)
mlpregress.fit(X, y)
print(f"score: {mlpregress.score(X, y)}")

print("Predictions:")
X_test = np.unique(X).reshape(-1, 1)
# Pair each unique x value with its predicted Life Expectancy
y_pred = np.c_[X_test, mlpregress.predict(X_test)]

fig, ax = plt.subplots(figsize=(10, 10))

plt.scatter(gh_df_clean['Neonatal Mortality Rate'], gh_df_clean['Life Expectancy'],
            color='grey', label='Original Data')
plt.plot(gh_unique, my_model_function(gh_unique, a_fit, b_fit),
         color='blue', label='Affine Fit')
plt.plot(gh_unique, my_model_function2(gh_unique, a_fit2, b_fit2, c_fit2),
         color='orange', label='Quadratic Fit')
plt.plot(y_pred[:, 0], y_pred[:, 1], color='yellow', label='ML')

plt.xlabel('Neonatal Mortality Rate')
plt.ylabel('Life Expectancy')
plt.title('Fits to DataFrame Columns')
plt.legend()
plt.show()

y_pred
score: 0.6991411311845773
Predictions:
[Figure: scatter of Life Expectancy vs. Neonatal Mortality Rate with the affine (blue), quadratic (orange), and ML (yellow) prediction curves]
Out[53]:
array([[ 0.78      , 82.63998852],
       [ 0.81      , 82.62772329],
       [ 0.84      , 82.58071411],
       ...,
       [71.43      , 55.51561612],
       [72.9       , 55.51561612],
       [74.31      , 55.51561612]], shape=(3103, 2))
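
Note that the score above is measured on the same rows the network was trained on. A rough sketch of a held-out evaluation, with the input scaled before the tanh units (the 80/20 split, StandardScaler, and pipeline are illustrative choices, not part of the assignment):

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing (illustrative split)
X_train, X_hold, y_train, y_hold = train_test_split(X, y, test_size=0.2, random_state=1)

# Scale the single feature so the tanh units don't saturate on large raw values
model = make_pipeline(StandardScaler(),
                      MLPRegressor(solver='sgd', hidden_layer_sizes=(100,),
                                   activation='tanh', random_state=1))
model.fit(X_train, y_train)
print(f"train score: {model.score(X_train, y_train)}")
print(f"held-out score: {model.score(X_hold, y_hold)}")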