# Using NumPy, SciPy, Pandas' dataframes and Matplotlib

import numpy as np

import pandas as pd

from pandas import Series, DataFrame

%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.optimize import curve_fit

# Read the unified dataset and preview its first rows to confirm the import and column layout
gh_df = pd.read_csv(filepath_or_buffer="datasets/UnifiedDataset.csv")
gh_df.head(n=5)

# Paths of the three per-country CSV tables used below, in the order:
# life expectancy, GDP per capita, average daily income.
file_list = ['datasets/lex.csv', 'datasets/gdp_pcap.csv', 'datasets/mincpcap_cppp.csv']

# Un-indexed copies of the three tables (default integer index).
df_list = [pd.read_csv(filename) for filename in file_list]

# The same tables indexed by the `geo` column, so that rows of different
# tables can be aligned by label later on.
life_expectancy = pd.read_csv(file_list[0], index_col="geo")
gdp_per_capita = pd.read_csv(file_list[1], index_col="geo")
average_daily_income = pd.read_csv(file_list[2], index_col="geo")

# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that emitted a stray "None" after the report).
average_daily_income.info()

# Rich display of the frame (pandas truncates long frames automatically).
average_daily_income

<class 'pandas.core.frame.DataFrame'>
Index: 193 entries, afg to zwe
Columns: 302 entries, name to 2100
dtypes: float64(301), object(1)
memory usage: 456.9+ KB
None

# First row of the table, column positions 1..223 (position 0 is the `name` column)
average_daily_income.iloc[0, range(1, 224)]

1800    1.0208
1801    1.0208
1802    1.0208
1803    1.0208
1804    1.0208
         ...  
2018    5.2825
2019    5.3278
2020     5.041
2021    3.9025
2022    3.6068
Name: afg, Length: 223, dtype: object

# Report, for every column of the life-expectancy table, how many entries are missing
print('Missing values per column:', life_expectancy.isna().sum(), sep='\n')

Missing values per column:
name     0
1800    10
1801    10
1802    10
1803    10
        ..
2096     1
2097     1
2098     1
2099     1
2100     1
Length: 302, dtype: int64

# Distribution of life expectancy across countries for the selected year.
# `year` is a string because the CSV column labels are strings; later cells reuse it.
year = '2015'
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=life_expectancy[year], binwidth=5, kde=False, color='lightcoral', ax=ax)

ax.set_title('Distribution of Life Expectancy in '+year)
ax.set_xlabel('Life Expectancy')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

# Distribution of average daily income across countries for the selected year
# (bin width left to seaborn's default rule).
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=average_daily_income[year], kde=False, color='firebrick', ax=ax)
ax.set_title('Distribution of Average Daily Income in '+year)
ax.set_xlabel('Average Daily Income')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

# Show the selected year's income column (one value per `geo` row)
average_daily_income.loc[:, year]

geo
afg     5.4014
ago     6.5517
alb    13.0307
and    61.1771
are    77.2049
        ...   
wsm    11.5848
yem     3.4980
zaf    12.1293
zmb     3.3653
zwe     5.2510
Name: 2015, Length: 193, dtype: float64

# 1. Put income and life expectancy for the selected year side by side.
#    pd.concat aligns the two Series on their shared `geo` index; rows where
#    either value is missing are dropped so both columns have the same length.
df_concat = pd.concat(
    [average_daily_income[year], life_expectancy[year]],
    axis=1,
    keys=['ADI', 'LE'],
).dropna()

x_data = df_concat['ADI']
y_data = df_concat['LE']

# 2. Scatter plot through Matplotlib's object-oriented interface
fig, ax = plt.subplots(figsize=(9, 6))
ax.scatter(x_data.values, y_data.values, s=60, alpha=0.7, edgecolors="k")

# 3. Log-linear plot: logarithmic x-axis, y-axis stays linear (the default)
ax.set_xscale("log")

# 4. Labels, title and dashed grid lines on both major and minor ticks
ax.set(
    xlabel='average_daily_income (log scale)',
    ylabel='life_expectancy (linear scale)',
    title='Log-Linear Scatter Plot of Average Daily Income vs Life Expectancy in '+year,
)
ax.grid(True, which="both", linestyle="--", linewidth=0.5)

# 5. Render the figure
plt.show()

# Same relationship as above, but with the logarithm applied to the income
# values themselves instead of to the axis scale.
log_adi = np.log(average_daily_income[year])
df = pd.DataFrame({'ADI_log': log_adi, 'LE': life_expectancy[year]})

# Scatter plot straight from the DataFrame plotting accessor
df.plot(
    kind='scatter',
    x='ADI_log',
    y='LE',
    title='Scatter Plot of log(Average Daily Income) vs Life Expectancy in '+year,
)

plt.show()

# Rebuild the two-column table (income, life expectancy) for the selected year
# and keep only the rows where both values are present.
df_concat = pd.concat(
    {'ADI': average_daily_income[year], 'LE': life_expectancy[year]},
    axis=1,
).dropna()

# df_clean is simply another name for the NaN-free table
df_clean = df_concat

# Fit inputs: log of income as the independent variable, life expectancy as the target
x_data = np.log(df_clean['ADI'])
y_data = df_clean['LE']

def my_model_function(x, a, b):
    """Affine model ``a * x + b``.

    Parameters
    ----------
    x : scalar or array-like supporting arithmetic
        Independent variable.
    a : float
        Slope.
    b : float
        Intercept.

    Returns
    -------
    Same kind of object as ``x`` holding the model values.
    """
    slope_term = a * x
    return slope_term + b
# Least-squares fit of the affine model; popt holds the optimal (a, b),
# pcov their estimated covariance matrix.
popt, pcov = curve_fit(f=my_model_function, xdata=x_data, ydata=y_data)
a_fit = popt[0]
b_fit = popt[1]
print(f"Optimal affine parameters: a={a_fit}, b={b_fit}")

def my_model_function2(x, a, b, c):
    """Quadratic model ``a * x**2 + b * x + c``.

    Parameters
    ----------
    x : scalar or array-like supporting arithmetic
        Independent variable.
    a, b, c : float
        Coefficients of the quadratic, linear and constant terms.

    Returns
    -------
    Same kind of object as ``x`` holding the model values.
    """
    quadratic_term = a * x**2
    linear_term = b * x
    return quadratic_term + linear_term + c
# Least-squares fit of the quadratic model; popt2 holds the optimal (a, b, c),
# pcov2 their estimated covariance matrix.
popt2, pcov2 = curve_fit(f=my_model_function2, xdata=x_data, ydata=y_data)
a_fit2 = popt2[0]
b_fit2 = popt2[1]
c_fit2 = popt2[2]
print(f"Optimal quadratic parameters: a={a_fit2}, b={b_fit2}, c={c_fit2}")

# Sorted, de-duplicated abscissae so each fitted curve is drawn once, left to right.
x_unique = np.sort(x_data.unique())

fig, ax = plt.subplots(figsize=(10, 10))

ax.scatter(x_data, y_data, color='grey', label='Original Data')

# Both fitted models are evaluated on the same sorted grid; drawing the affine
# line over the unsorted data would retrace the same segment back and forth.
ax.plot(x_unique, my_model_function(x_unique, a_fit, b_fit), color='blue', label='Affine Fit')
ax.plot(x_unique, my_model_function2(x_unique, a_fit2, b_fit2, c_fit2), color='orange', label='Quadratic Fit')

ax.set_xlabel('log(Average Daily Income)')
ax.set_ylabel('Life Expectancy')
ax.set_title('Linear Polynomial Fit to DataFrame Columns')
ax.legend()

plt.show()

Optimal affine parameters: a=6.098487545060972, b=55.34704345235798
Optimal quadratic parameters: a=-0.9485700160740614, b=11.47292550169573, c=48.73246024913743

from sklearn.neural_network import MLPRegressor

# scikit-learn estimators expect a 2-D feature matrix, hence the single-column reshape.
X = x_data.to_numpy().reshape(-1,1)
y = y_data.to_numpy()

# One hidden layer of 100 tanh units trained with plain SGD. The layer sizes are
# written as a real one-element tuple: `(100)` is just the integer 100.
# random_state pins the weight initialisation so the run is repeatable.
mlpregress = MLPRegressor(solver='sgd',hidden_layer_sizes=(100,),activation='tanh',random_state=1)
mlpregress.fit(X,y)
# score() is evaluated on the training data itself
print(f"score: {mlpregress.score(X,y)}")
print("Predictions:")

# Predict on the sorted unique inputs; y_pred has two columns: input, prediction.
X_test = np.unique(X).reshape(-1,1)
y_pred = np.c_[X_test,mlpregress.predict(X_test)]

fig, ax = plt.subplots(figsize=(10, 10))

ax.scatter(x_data, y_data, color='grey', label='Original Data')

# All curves are drawn over sorted abscissae so each is traced once.
ax.plot(x_unique, my_model_function(x_unique, a_fit, b_fit), color='blue', label='Affine Fit')
ax.plot(x_unique, my_model_function2(x_unique, a_fit2, b_fit2, c_fit2), color='orange', label='Quadratic Fit')
ax.plot(X_test[:, 0], y_pred[:, 1], color='yellow', label='ML')

# x_data is log(average daily income), not GDP per capita
ax.set_xlabel('log(Average Daily Income)')
ax.set_ylabel('Life Expectancy')
ax.set_title('Fits to DataFrame Columns')
ax.legend()
plt.show()

score: -0.2807243992121897
Predictions:

# Sample to describe: life expectancy in the selected year. Missing values are
# dropped first so that the point count, the histogram and the moments
# (pandas mean/std skip NaN) all refer to the same set of observations.
prob_data = life_expectancy[year].dropna()
npts = prob_data.size
mean = prob_data.mean()
stddev = prob_data.std()

#
# plot histogram and points
#
fig, ax = plt.subplots(figsize=(10, 10))

# Roughly ten points per bin, but never fewer than one bin.
ax.hist(prob_data, bins=max(npts//10, 1), density=True, label='Histogram (density)')
# Rug of the individual observations along y = 0
ax.plot(prob_data, 0*prob_data, '|', ms=npts/20, label='Observations')
#
# plot Gaussian with the sample mean and standard deviation over +/- 3 sigma
#
xi = np.linspace(mean-3*stddev, mean+3*stddev, 100)
yi = np.exp(-(xi-mean)**2/(2*stddev**2))/np.sqrt(2*np.pi*stddev**2)
ax.plot(xi, yi, 'r', label='Gaussian (sample mean, std)')

ax.set_xlabel('Life Expectancy')
ax.set_ylabel('Probability density')
ax.set_title('Life Expectancy in '+year+': histogram and Gaussian')
ax.legend()
plt.show()

from scipy.spatial import Voronoi,voronoi_plot_2d
import time

# Two-column table (income, life expectancy) for the selected year; df_concat
# keeps every row, df_clean only those where both values are present.
df_concat = pd.concat(
    [average_daily_income[year], life_expectancy[year]],
    axis=1,
    keys=['ADI', 'LE'],
)
df_clean = df_concat.dropna()

# Clustering is done in the (log income, life expectancy) plane
x_data = np.log(df_clean['ADI'])
y_data = df_clean['LE']

#
# k-means parameters
#
nsteps = 1000
momentum = 0.

# Plain NumPy arrays for the clustering routines
x = x_data.to_numpy()
y = y_data.to_numpy()

def kmeans(x,y,momentum,nclusters,nsteps=1000):
    """Cluster 2-D points with Lloyd's k-means iteration.

    Parameters
    ----------
    x, y : array-like, 1-D, same length
        Coordinates of the points. They are converted to float arrays, so
        integer input no longer truncates the centroid means.
    momentum : float
        Accepted for backward compatibility; the update does not use it.
    nclusters : int
        Number of centroids (>= 1).
    nsteps : int, optional
        Maximum number of update steps (default 1000). The loop stops early
        once the point-to-centroid assignment no longer changes.

    Returns
    -------
    mux, muy : ndarray of shape (nclusters,)
        Centroid coordinates.
    distances : float
        Sum over all points of the Euclidean distance to the assigned centroid.

    Raises
    ------
    ValueError
        If the input is empty, ``x`` and ``y`` differ in length, or
        ``nclusters`` is smaller than 1.

    Notes
    -----
    Starting centroids are drawn from NumPy's global RNG (``np.random``), so
    call ``np.random.seed`` beforehand for reproducible results.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    npts = len(x)
    if npts == 0:
        raise ValueError("kmeans needs at least one point")
    if len(y) != npts:
        raise ValueError("x and y must have the same length")
    if nclusters < 1:
        raise ValueError("nclusters must be at least 1")
    #
    # choose starting points: distinct data points whenever there are enough of
    # them, so two centroids never start on the same observation
    #
    indices = np.random.choice(npts, size=nclusters, replace=nclusters > npts)
    mux = x[indices].copy()
    muy = y[indices].copy()

    def closest(mux, muy):
        # index of the nearest centroid for every point, via an (npts, nclusters) distance table
        return np.argmin(np.hypot(x[:, None] - mux[None, :], y[:, None] - muy[None, :]), axis=1)

    #
    # do k-means iteration
    #
    mins = closest(mux, muy)
    for _ in range(nsteps):
        #
        # update means; a cluster that lost all its points keeps its previous
        # centroid instead of becoming 0/0 = NaN
        #
        for k in range(nclusters):
            members = mins == k
            if members.any():
                mux[k] = x[members].mean()
                muy[k] = y[members].mean()
        #
        # find closest points again; unchanged assignment means convergence
        #
        new_mins = closest(mux, muy)
        if np.array_equal(new_mins, mins):
            break
        mins = new_mins
    #
    # find distances, using the assignment that matches the final centroids
    #
    distances = np.sum(np.hypot(x - mux[mins], y - muy[mins]))
    return mux,muy,distances

def plot_kmeans(x,y,mux,muy):
    """Show the data points together with the cluster centroids.

    Points ``(x, y)`` are drawn as small dots and centroids ``(mux, muy)`` as
    large red dots. The axes are clipped to the bounding box of the data and
    the title reports the number of centroids. The figure is displayed
    immediately; nothing is returned.
    """
    fig, ax = plt.subplots()
    ax.plot(x, y, '.')
    ax.plot(mux, muy, 'r.', markersize=20)
    # limit the view to the extent of the data points
    ax.set_xlim(np.min(x), np.max(x))
    ax.set_ylim(np.min(y), np.max(y))
    ax.set_title(f"{len(mux)} clusters")
    plt.show()

def plot_Voronoi(x,y,mux,muy):
    """Show the data points with the Voronoi diagram of the cluster centroids.

    Points ``(x, y)`` are drawn as small dots; the Voronoi tessellation built
    from the centroids ``(mux, muy)`` is drawn on the same axes with the
    centroids shown and the Voronoi vertices hidden. The axes are clipped to
    the bounding box of the data and the title reports the number of
    centroids. The figure is displayed immediately; nothing is returned.
    """
    fig, ax = plt.subplots()
    ax.plot(x, y, '.')
    # one (x, y) row per centroid
    centroids = np.column_stack((mux, muy))
    voronoi_plot_2d(Voronoi(centroids), ax=ax, show_points=True, show_vertices=False, point_size=20)
    # limit the view to the extent of the data points
    ax.set_xlim(np.min(x), np.max(x))
    ax.set_ylim(np.min(y), np.max(y))
    ax.set_title(f"{len(mux)} clusters")
    plt.show()

# kmeans() draws its starting centroids from NumPy's global RNG; seeding it
# makes the cluster plots and the distance curve identical on every re-run.
np.random.seed(0)

# distances[k-1] holds the total point-to-centroid distance for k clusters, k = 1..5
distances = np.zeros(5)

for nclusters in range(1, len(distances) + 1):
    mux,muy,distances[nclusters - 1] = kmeans(x,y,momentum,nclusters)
    if nclusters < 3:
        # 1 and 2 centroids use the plain scatter view
        plot_kmeans(x,y,mux,muy)
    else:
        # 3 or more centroids are shown with their Voronoi cells
        plot_Voronoi(x,y,mux,muy)

# Total distance to the centroids as a function of the number of clusters (1..5)
fig, ax = plt.subplots()
ax.plot(np.arange(1, 6), distances, 'o')
ax.set_xlabel('number of clusters')
ax.set_ylabel('total distances to clusters')
# cluster counts are integers, so only integer tick positions make sense
ax.xaxis.get_major_locator().set_params(integer=True)
plt.show()

	Country	Year	Gender	Life Expectancy	Infant Mortality Rate	Low CI Value Infant Mortality Rate	High CI Value Infant Mortality Rate	Under 5 Mortality Rate	Low CI Value Under 5 Mortality Rate	High CI Value Under 5 Mortality Rate	...	Cereal Consumption Rye	Cereal Consumption Barley	Cereal Consumption Sorghum	Cereal Consumption Maize	Cereal Consumption Wheat	Cereal Consumption Rice	Diet Calories Animal Protein	Diet Calories Plant Protein	Diet Calories Fat	Diet Calories Carbohydrates
0	Afghanistan	1990	Both sexes	50.331	120.4	111.2	130.9	177.7	162.5	194.3	...	NaN	103.0	NaN	201.0	1195.0	174.0	67.80	197.08	435.60	1613.52
1	Afghanistan	1990	Female	51.442	114.2	105.1	124.7	173.1	158.0	189.7	...	NaN	103.0	NaN	201.0	1195.0	174.0	67.80	197.08	435.60	1613.52
2	Afghanistan	1990	Male	49.281	126.2	116.4	137.5	182.0	166.6	199.3	...	NaN	103.0	NaN	201.0	1195.0	174.0	67.80	197.08	435.60	1613.52
3	Afghanistan	1991	Both sexes	50.999	116.8	108.2	126.2	171.7	157.6	186.9	...	NaN	94.0	NaN	164.0	1043.0	159.0	64.96	173.68	370.08	1435.28
4	Afghanistan	1991	Female	52.119	110.7	102.1	120.4	167.1	153.0	182.6	...	NaN	94.0	NaN	164.0	1043.0	159.0	64.96	173.68	370.08	1435.28

	name	1800	1801	1802	1803	1804	1805	1806	1807	1808	...	2091	2092	2093	2094	2095	2096	2097	2098	2099	2100
geo
afg	Afghanistan	1.0208	1.0208	1.0208	1.0208	1.0208	1.0208	1.0208	1.0208	1.0208	...	17.1045	17.5239	17.9534	18.3932	18.8435	19.3044	19.7761	20.2587	20.7524	21.2574
ago	Angola	0.6814	0.6836	0.6869	0.6903	0.6925	0.6958	0.6991	0.7024	0.7057	...	21.9803	22.3552	22.7321	23.1108	23.4911	23.8730	24.2562	24.6407	25.0263	25.4129
alb	Albania	0.8390	0.8415	0.8439	0.8463	0.8487	0.8511	0.8536	0.8560	0.8585	...	58.1668	58.6121	59.0526	59.4880	59.9185	60.3439	60.7642	61.1794	61.5895	61.9944
and	Andorra	1.5746	1.5773	1.5799	1.5839	1.5865	1.5891	1.5917	1.5944	1.5983	...	81.3023	81.3932	81.4820	81.5690	81.6541	81.7374	81.8189	81.8986	81.9766	82.0530
are	UAE	1.4255	1.4298	1.4355	1.4412	1.4469	1.4512	1.4569	1.4626	1.4683	...	96.5614	96.5443	96.5276	96.5113	96.4954	96.4799	96.4647	96.4499	96.4354	96.4213
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
wsm	Samoa	3.9445	3.9445	3.9473	3.9473	3.9473	3.9473	3.9473	3.9501	3.9501	...	56.6344	57.5937	58.5577	59.5260	60.4983	61.4744	62.4537	63.4360	64.4210	65.4082
yem	Yemen	2.9144	2.9211	2.9310	2.9377	2.9477	2.9543	2.9643	2.9709	2.9809	...	10.3401	10.5888	10.8440	11.1060	11.3749	11.6508	11.9340	12.2244	12.5224	12.8281
zaf	South Africa	1.6592	1.6407	1.6224	1.6043	1.5865	1.5029	1.5029	1.6299	1.4270	...	37.5816	38.0247	38.4660	38.9052	39.3424	39.7772	40.2097	40.6397	41.0671	41.4918
zmb	Zambia	1.0128	1.0158	1.0189	1.0204	1.0235	1.0250	1.0280	1.0311	1.0326	...	16.5543	16.9001	17.2503	17.6048	17.9636	18.3265	18.6936	19.0648	19.4398	19.8187
zwe	Zimbabwe	2.1063	2.1087	2.1111	2.1135	2.1160	2.1184	2.1208	2.1232	2.1257	...	29.4654	30.1061	30.7563	31.4158	32.0845	32.7624	33.4493	34.1452	34.8497	35.5629

Presentation (Transforms)

Assignment

Setup

Dataset

Analysis

Function Fit

Fit a machine learning model

Probability

Density Estimation