Blair Evans - Fab Futures - Data Science
Home About

< Home

Presentation (Transforms)¶

Assignment¶

  • Analyze your data set
  • prepare a notebook with the analysis of your data set, store it in your repo, and call it presentation.ipynb
  • include a 1920x1080 summary slide describing you, your data, and your analysis, store it your repo's images folder, and call it presentation.png

Setup¶

In [1]:
# Using NumPy, SciPy, Pandas' dataframes and Matplotlib

import numpy as np

import pandas as pd

from pandas import Series, DataFrame

%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.optimize import curve_fit

Dataset¶

I will use the Global Health, Nutrition, Mortality, Economic Data from Kaggle. The CSV for the dataset is named UnifiedDataset.csv

In [2]:
gh_df = pd.read_csv("datasets/UnifiedDataset.csv")
gh_df.head() # quick check of data import and structure
Out[2]:
Country Year Gender Life Expectancy Infant Mortality Rate Low CI Value Infant Mortality Rate High CI Value Infant Mortality Rate Under 5 Mortality Rate Low CI Value Under 5 Mortality Rate High CI Value Under 5 Mortality Rate ... Cereal Consumption Rye Cereal Consumption Barley Cereal Consumption Sorghum Cereal Consumption Maize Cereal Consumption Wheat Cereal Consumption Rice Diet Calories Animal Protein Diet Calories Plant Protein Diet Calories Fat Diet Calories Carbohydrates
0 Afghanistan 1990 Both sexes 50.331 120.4 111.2 130.9 177.7 162.5 194.3 ... NaN 103.0 NaN 201.0 1195.0 174.0 67.80 197.08 435.60 1613.52
1 Afghanistan 1990 Female 51.442 114.2 105.1 124.7 173.1 158.0 189.7 ... NaN 103.0 NaN 201.0 1195.0 174.0 67.80 197.08 435.60 1613.52
2 Afghanistan 1990 Male 49.281 126.2 116.4 137.5 182.0 166.6 199.3 ... NaN 103.0 NaN 201.0 1195.0 174.0 67.80 197.08 435.60 1613.52
3 Afghanistan 1991 Both sexes 50.999 116.8 108.2 126.2 171.7 157.6 186.9 ... NaN 94.0 NaN 164.0 1043.0 159.0 64.96 173.68 370.08 1435.28
4 Afghanistan 1991 Female 52.119 110.7 102.1 120.4 167.1 153.0 182.6 ... NaN 94.0 NaN 164.0 1043.0 159.0 64.96 173.68 370.08 1435.28

5 rows × 150 columns

In [3]:
file_list = ['datasets/lex.csv', 'datasets/gdp_pcap.csv', 'datasets/mincpcap_cppp.csv']
df_list = [pd.read_csv(filename) for filename in file_list]

life_expectancy = pd.read_csv('datasets/lex.csv', index_col="geo")
gdp_per_capita = pd.read_csv('datasets/gdp_pcap.csv', index_col="geo")
average_daily_income = pd.read_csv('datasets/mincpcap_cppp.csv', index_col="geo")
# print(life_expectancy.head())

#print(life_expectancy.info())
#print(gdp_per_capita.info())
print(average_daily_income.info())

#life_expectancy
#gdp_per_capita
average_daily_income
<class 'pandas.core.frame.DataFrame'>
Index: 193 entries, afg to zwe
Columns: 302 entries, name to 2100
dtypes: float64(301), object(1)
memory usage: 456.9+ KB
None
Out[3]:
name 1800 1801 1802 1803 1804 1805 1806 1807 1808 ... 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100
geo
afg Afghanistan 1.0208 1.0208 1.0208 1.0208 1.0208 1.0208 1.0208 1.0208 1.0208 ... 17.1045 17.5239 17.9534 18.3932 18.8435 19.3044 19.7761 20.2587 20.7524 21.2574
ago Angola 0.6814 0.6836 0.6869 0.6903 0.6925 0.6958 0.6991 0.7024 0.7057 ... 21.9803 22.3552 22.7321 23.1108 23.4911 23.8730 24.2562 24.6407 25.0263 25.4129
alb Albania 0.8390 0.8415 0.8439 0.8463 0.8487 0.8511 0.8536 0.8560 0.8585 ... 58.1668 58.6121 59.0526 59.4880 59.9185 60.3439 60.7642 61.1794 61.5895 61.9944
and Andorra 1.5746 1.5773 1.5799 1.5839 1.5865 1.5891 1.5917 1.5944 1.5983 ... 81.3023 81.3932 81.4820 81.5690 81.6541 81.7374 81.8189 81.8986 81.9766 82.0530
are UAE 1.4255 1.4298 1.4355 1.4412 1.4469 1.4512 1.4569 1.4626 1.4683 ... 96.5614 96.5443 96.5276 96.5113 96.4954 96.4799 96.4647 96.4499 96.4354 96.4213
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
wsm Samoa 3.9445 3.9445 3.9473 3.9473 3.9473 3.9473 3.9473 3.9501 3.9501 ... 56.6344 57.5937 58.5577 59.5260 60.4983 61.4744 62.4537 63.4360 64.4210 65.4082
yem Yemen 2.9144 2.9211 2.9310 2.9377 2.9477 2.9543 2.9643 2.9709 2.9809 ... 10.3401 10.5888 10.8440 11.1060 11.3749 11.6508 11.9340 12.2244 12.5224 12.8281
zaf South Africa 1.6592 1.6407 1.6224 1.6043 1.5865 1.5029 1.5029 1.6299 1.4270 ... 37.5816 38.0247 38.4660 38.9052 39.3424 39.7772 40.2097 40.6397 41.0671 41.4918
zmb Zambia 1.0128 1.0158 1.0189 1.0204 1.0235 1.0250 1.0280 1.0311 1.0326 ... 16.5543 16.9001 17.2503 17.6048 17.9636 18.3265 18.6936 19.0648 19.4398 19.8187
zwe Zimbabwe 2.1063 2.1087 2.1111 2.1135 2.1160 2.1184 2.1208 2.1232 2.1257 ... 29.4654 30.1061 30.7563 31.4158 32.0845 32.7624 33.4493 34.1452 34.8497 35.5629

193 rows × 302 columns

In [4]:
average_daily_income.iloc[0, range(1, 224, 1)]
Out[4]:
1800    1.0208
1801    1.0208
1802    1.0208
1803    1.0208
1804    1.0208
         ...  
2018    5.2825
2019    5.3278
2020     5.041
2021    3.9025
2022    3.6068
Name: afg, Length: 223, dtype: object
In [5]:
# Display basic info about missing values
print('Missing values per column:')
print(life_expectancy.isnull().sum())
Missing values per column:
name     0
1800    10
1801    10
1802    10
1803    10
        ..
2096     1
2097     1
2098     1
2099     1
2100     1
Length: 302, dtype: int64

Analysis¶

In [6]:
# Histogram of Life Expectancy
year = '2015'
plt.figure(figsize=(8, 5))
sns.histplot(life_expectancy[year], binwidth=5, kde=False, color='lightcoral')
# sns.histplot(average_daily_income['2000'], kde=False, color='lightcoral')

plt.title('Distribution of Life Expectancy in '+year)
plt.xlabel('Life Expectancy')
plt.ylabel('Count')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [7]:
# Histogram of Average Daily Income
plt.figure(figsize=(8, 5))
sns.histplot(average_daily_income[year], kde=False, color='firebrick')
plt.title('Distribution of Average Daily Income in '+year)
plt.xlabel('Average Daily Income')
plt.ylabel('Count')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [8]:
average_daily_income[year]
Out[8]:
geo
afg     5.4014
ago     6.5517
alb    13.0307
and    61.1771
are    77.2049
        ...   
wsm    11.5848
yem     3.4980
zaf    12.1293
zmb     3.3653
zwe     5.2510
Name: 2015, Length: 193, dtype: float64
In [9]:
df_concat = pd.concat([average_daily_income[year], life_expectancy[year]], axis=1)
df_concat.columns = ['ADI', 'LE']
df_concat = df_concat.dropna() #

x_data = df_concat['ADI']
y_data = df_concat['LE']
# Ensure they have the same index (pandas series created this way share a default index)
# If your series have different indexes, align them first (e.g., using pd.concat or reindex)

# 2. Create the scatter plot using Matplotlib's object-oriented interface
fig, ax = plt.subplots(figsize=(9, 6))

# Use the values from the pandas Series for the scatter plot
ax.scatter(x_data.values, y_data.values, s=60, alpha=0.7, edgecolors="k")

# 3. Set the desired axis to a logarithmic scale
# For a log-linear plot, set the x-axis to 'log'
ax.set_xscale("log")
# The y-axis is linear by default

# 4. Add labels and a title for clarity
ax.set_xlabel('average_daily_income (log scale)')
ax.set_ylabel('life_expectancy (linear scale)')
ax.set_title('Log-Linear Scatter Plot of Average Daily Income vs Life Expectancy in '+year)
ax.grid(True, which="both", ls="--", linewidth=0.5) # Add grid lines

# 5. Display the plot
plt.show()
No description has been provided for this image
In [10]:
log_adi = np.log(average_daily_income[year])
df = pd.DataFrame({'ADI_log': log_adi, 'LE': life_expectancy[year]})

# Generate the scatter plot
df.plot.scatter(x='ADI_log', y='LE', title='Scatter Plot of log(Average Daily Income) vs Life Expectancy in '+year)


# Display the plot
plt.show()
No description has been provided for this image

Function Fit¶

Look at Life Expectancy vs Average Daily Income. Look at a scatter plot of the raw data and linear and quadratic fits to the data.

In [11]:
df_concat = pd.concat([average_daily_income[year], life_expectancy[year]], axis=1)
df_concat.columns = ['ADI', 'LE']
df_concat = df_concat.dropna()

# Drop rows with missing ADI or LE
#df_clean = df_concat.dropna(subset=['ADI']).copy()
#df_clean = df_clean.dropna(subset=['LE']).copy()
df_clean = df_concat

x_data = np.log(df_clean['ADI'])
y_data = df_clean['LE']

def my_model_function(x, a, b):
   return a * x + b
popt, pcov = curve_fit(my_model_function, x_data, y_data)
a_fit, b_fit = popt
print(f"Optimal affine parameters: a={a_fit}, b={b_fit}")

def my_model_function2(x, a, b, c):
   return a * x**2 + b * x + c
popt2, pcov2 = curve_fit(my_model_function2, x_data, y_data)
a_fit2, b_fit2, c_fit2 = popt2
print(f"Optimal quadratic parameters: a={a_fit2}, b={b_fit2}, c={c_fit2}")

x_unique = np.sort(x_data.unique())

ig, ax = plt.subplots(figsize=(10, 10))

plt.scatter(x_data, y_data, color='grey', label='Original Data')

plt.plot(x_data, my_model_function(x_data, a_fit, b_fit), color='blue', label=f'Affine Fit')
plt.plot(x_unique, my_model_function2(x_unique, a_fit2, b_fit2, c_fit2), color='orange', label=f'Quadratic Fit')

plt.xlabel('log(Average Daily Income)')
plt.ylabel('Life Expectancy')
plt.title('Linear Polynomial Fit to DataFrame Columns')
plt.legend()

plt.show()
Optimal affine parameters: a=6.098487545060972, b=55.34704345235798
Optimal quadratic parameters: a=-0.9485700160740614, b=11.47292550169573, c=48.73246024913743
No description has been provided for this image

Fit a machine learning model¶

In [12]:
from sklearn.neural_network import MLPRegressor
import numpy as np
X = x_data.to_numpy().reshape(-1,1)
y = y_data.to_numpy()
mlpregress = MLPRegressor(solver='sgd',hidden_layer_sizes=(100),activation='tanh',random_state=1)
mlpregress.fit(X,y)
print(f"score: {mlpregress.score(X,y)}")
print("Predictions:")

X_test = np.unique(X).reshape(-1,1)
y_pred = np.c_[X_test,mlpregress.predict(X_test)]

ig, ax = plt.subplots(figsize=(10, 10))

plt.scatter(x_data, y_data, color='grey', label='Original Data')

plt.plot(x_data, my_model_function(x_data, a_fit, b_fit), color='blue', label=f'Affine Fit')
plt.plot(x_unique, my_model_function2(x_unique, a_fit2, b_fit2, c_fit2), color='orange', label=f'Quadratic Fit')
plt.plot(X_test, y_pred[:, 1], color='yellow', label=f'ML')

plt.xlabel('GDP per Capita')
plt.ylabel('Life Expectancy')
plt.title('Fits to DataFrame Columns')
plt.legend()
plt.show()
score: -0.2807243992121897
Predictions:
No description has been provided for this image

Probability¶

In [13]:
# prob_data = x_data
prob_data = life_expectancy[year]
npts = prob_data.size
mean = prob_data.mean()
stddev = prob_data.std()

#
# plot histogram and points
#
ig, ax = plt.subplots(figsize=(10, 10))


plt.hist(prob_data,bins=npts//10
        ,density=True)
plt.plot(prob_data,0*prob_data,'|',ms=npts/20)
#
# plot Gaussian
#
xi = np.linspace(mean-3*stddev,mean+3*stddev,100)
yi = np.exp(-(xi-mean)**2/(2*stddev**2))/np.sqrt(2*np.pi*stddev**2)
plt.plot(xi,yi,'r')
plt.show()
No description has been provided for this image

Density Estimation¶

Begining to work on this, more is needed debug cluster size of 2. Note that example code has errors in def plot_kmeans(x,y,mux,muy) and def plot_Voronoi(x,y,mux,muy) where plt.ylim() is defined by x values instead of y.

In [ ]:
 
In [14]:
from scipy.spatial import Voronoi,voronoi_plot_2d
import time

df_concat = pd.concat([average_daily_income[year], life_expectancy[year]], axis=1)
df_concat.columns = ['ADI', 'LE']
df_clean = df_concat.dropna()

x_data = np.log(df_clean['ADI'])
y_data = df_clean['LE']

#
# k-means parameters
#
nsteps = 1000
momentum = 0.

x = x_data.to_numpy()
y = y_data.to_numpy()

def kmeans(x,y,momentum,nclusters):
    #
    # choose starting points
    #
    indices = np.random.uniform(low=0,high=len(x),size=nclusters).astype(int)
    mux = x[indices]
    muy = y[indices]
    #
    # do k-means iteration
    #
    for i in range(nsteps):
        #
        # find closest points
        #
        X = np.outer(x,np.ones(len(mux)))
        Y = np.outer(y,np.ones(len(muy)))
        Mux = np.outer(np.ones(len(x)),mux)
        Muy = np.outer(np.ones(len(x)),muy)
        distances = np.sqrt((X-Mux)**2+(Y-Muy)**2)
        mins = np.argmin(distances,axis=1)
        #
        # update means
        #
        for i in range(len(mux)):
            index = np.where(mins == i)
            mux[i] = np.sum(x[index])/len(index[0])
            muy[i] = np.sum(y[index])/len(index[0])
    #
    # find distances
    #
    distances = 0
    for i in range(len(mux)):
        index = np.where(mins == i)
        distances += np.sum(np.sqrt((x[index]-mux[i])**2+(y[index]-muy[i])**2))
    return mux,muy,distances

def plot_kmeans(x,y,mux,muy):
    xmin = np.min(x)
    xmax = np.max(x)
    ymin = np.min(y)
    ymax = np.max(y)
    fig,ax = plt.subplots()
    plt.plot(x,y,'.')
    plt.plot(mux,muy,'r.',markersize=20)
    plt.xlim(xmin,xmax)
    plt.ylim(ymin,ymax)
    plt.title(f"{len(mux)} clusters")
    plt.show()

def plot_Voronoi(x,y,mux,muy):
    xmin = np.min(x)
    xmax = np.max(x)
    ymin = np.min(y)
    ymax = np.max(y)
    fig,ax = plt.subplots()
    plt.plot(x,y,'.')
    vor = Voronoi(np.stack((mux,muy),axis=1))
    voronoi_plot_2d(vor,ax=ax,show_points=True,show_vertices=False,point_size=20)
    plt.xlim(xmin,xmax)
    plt.ylim(ymin,ymax)
    plt.title(f"{len(mux)} clusters")
    plt.show()

distances = np.zeros(5)

mux,muy,distances[0] = kmeans(x,y,momentum,1)
plot_kmeans(x,y,mux,muy)

mux,muy,distances[1] = kmeans(x,y,momentum,2)
plot_kmeans(x,y,mux,muy)

mux,muy,distances[2] = kmeans(x,y,momentum,3)
plot_Voronoi(x,y,mux,muy)

mux,muy,distances[3] = kmeans(x,y,momentum,4)
plot_Voronoi(x,y,mux,muy)

mux,muy,distances[4] = kmeans(x,y,momentum,5)
plot_Voronoi(x,y,mux,muy)

fig,ax = plt.subplots()
plt.plot(np.arange(1,6),distances,'o')
plt.xlabel('number of clusters')
plt.ylabel('total distances to clusters')
ax.xaxis.get_major_locator().set_params(integer=True)
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]: