< Home
Presentation (Transforms)¶
Assignment¶
- Analyze your data set
- prepare a notebook with the analysis of your data set, store it in your repo, and call it presentation.ipynb
- include a 1920x1080 summary slide describing you, your data, and your analysis, store it your repo's images folder, and call it presentation.png
Setup¶
In [1]:
# Using NumPy, SciPy, Pandas' dataframes and Matplotlib
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.optimize import curve_fit
Dataset¶
I will use the Global Health, Nutrition, Mortality, Economic Data from Kaggle. The CSV for the dataset is named UnifiedDataset.csv
In [2]:
gh_df = pd.read_csv("datasets/UnifiedDataset.csv")
gh_df.head() # quick check of data import and structure
Out[2]:
| Country | Year | Gender | Life Expectancy | Infant Mortality Rate | Low CI Value Infant Mortality Rate | High CI Value Infant Mortality Rate | Under 5 Mortality Rate | Low CI Value Under 5 Mortality Rate | High CI Value Under 5 Mortality Rate | ... | Cereal Consumption Rye | Cereal Consumption Barley | Cereal Consumption Sorghum | Cereal Consumption Maize | Cereal Consumption Wheat | Cereal Consumption Rice | Diet Calories Animal Protein | Diet Calories Plant Protein | Diet Calories Fat | Diet Calories Carbohydrates | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | 1990 | Both sexes | 50.331 | 120.4 | 111.2 | 130.9 | 177.7 | 162.5 | 194.3 | ... | NaN | 103.0 | NaN | 201.0 | 1195.0 | 174.0 | 67.80 | 197.08 | 435.60 | 1613.52 |
| 1 | Afghanistan | 1990 | Female | 51.442 | 114.2 | 105.1 | 124.7 | 173.1 | 158.0 | 189.7 | ... | NaN | 103.0 | NaN | 201.0 | 1195.0 | 174.0 | 67.80 | 197.08 | 435.60 | 1613.52 |
| 2 | Afghanistan | 1990 | Male | 49.281 | 126.2 | 116.4 | 137.5 | 182.0 | 166.6 | 199.3 | ... | NaN | 103.0 | NaN | 201.0 | 1195.0 | 174.0 | 67.80 | 197.08 | 435.60 | 1613.52 |
| 3 | Afghanistan | 1991 | Both sexes | 50.999 | 116.8 | 108.2 | 126.2 | 171.7 | 157.6 | 186.9 | ... | NaN | 94.0 | NaN | 164.0 | 1043.0 | 159.0 | 64.96 | 173.68 | 370.08 | 1435.28 |
| 4 | Afghanistan | 1991 | Female | 52.119 | 110.7 | 102.1 | 120.4 | 167.1 | 153.0 | 182.6 | ... | NaN | 94.0 | NaN | 164.0 | 1043.0 | 159.0 | 64.96 | 173.68 | 370.08 | 1435.28 |
5 rows × 150 columns
In [3]:
file_list = ['datasets/lex.csv', 'datasets/gdp_pcap.csv', 'datasets/mincpcap_cppp.csv']
df_list = [pd.read_csv(filename) for filename in file_list]
life_expectancy = pd.read_csv('datasets/lex.csv', index_col="geo")
gdp_per_capita = pd.read_csv('datasets/gdp_pcap.csv', index_col="geo")
average_daily_income = pd.read_csv('datasets/mincpcap_cppp.csv', index_col="geo")
# print(life_expectancy.head())
#print(life_expectancy.info())
#print(gdp_per_capita.info())
print(average_daily_income.info())
#life_expectancy
#gdp_per_capita
average_daily_income
<class 'pandas.core.frame.DataFrame'> Index: 193 entries, afg to zwe Columns: 302 entries, name to 2100 dtypes: float64(301), object(1) memory usage: 456.9+ KB None
Out[3]:
| name | 1800 | 1801 | 1802 | 1803 | 1804 | 1805 | 1806 | 1807 | 1808 | ... | 2091 | 2092 | 2093 | 2094 | 2095 | 2096 | 2097 | 2098 | 2099 | 2100 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| geo | |||||||||||||||||||||
| afg | Afghanistan | 1.0208 | 1.0208 | 1.0208 | 1.0208 | 1.0208 | 1.0208 | 1.0208 | 1.0208 | 1.0208 | ... | 17.1045 | 17.5239 | 17.9534 | 18.3932 | 18.8435 | 19.3044 | 19.7761 | 20.2587 | 20.7524 | 21.2574 |
| ago | Angola | 0.6814 | 0.6836 | 0.6869 | 0.6903 | 0.6925 | 0.6958 | 0.6991 | 0.7024 | 0.7057 | ... | 21.9803 | 22.3552 | 22.7321 | 23.1108 | 23.4911 | 23.8730 | 24.2562 | 24.6407 | 25.0263 | 25.4129 |
| alb | Albania | 0.8390 | 0.8415 | 0.8439 | 0.8463 | 0.8487 | 0.8511 | 0.8536 | 0.8560 | 0.8585 | ... | 58.1668 | 58.6121 | 59.0526 | 59.4880 | 59.9185 | 60.3439 | 60.7642 | 61.1794 | 61.5895 | 61.9944 |
| and | Andorra | 1.5746 | 1.5773 | 1.5799 | 1.5839 | 1.5865 | 1.5891 | 1.5917 | 1.5944 | 1.5983 | ... | 81.3023 | 81.3932 | 81.4820 | 81.5690 | 81.6541 | 81.7374 | 81.8189 | 81.8986 | 81.9766 | 82.0530 |
| are | UAE | 1.4255 | 1.4298 | 1.4355 | 1.4412 | 1.4469 | 1.4512 | 1.4569 | 1.4626 | 1.4683 | ... | 96.5614 | 96.5443 | 96.5276 | 96.5113 | 96.4954 | 96.4799 | 96.4647 | 96.4499 | 96.4354 | 96.4213 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| wsm | Samoa | 3.9445 | 3.9445 | 3.9473 | 3.9473 | 3.9473 | 3.9473 | 3.9473 | 3.9501 | 3.9501 | ... | 56.6344 | 57.5937 | 58.5577 | 59.5260 | 60.4983 | 61.4744 | 62.4537 | 63.4360 | 64.4210 | 65.4082 |
| yem | Yemen | 2.9144 | 2.9211 | 2.9310 | 2.9377 | 2.9477 | 2.9543 | 2.9643 | 2.9709 | 2.9809 | ... | 10.3401 | 10.5888 | 10.8440 | 11.1060 | 11.3749 | 11.6508 | 11.9340 | 12.2244 | 12.5224 | 12.8281 |
| zaf | South Africa | 1.6592 | 1.6407 | 1.6224 | 1.6043 | 1.5865 | 1.5029 | 1.5029 | 1.6299 | 1.4270 | ... | 37.5816 | 38.0247 | 38.4660 | 38.9052 | 39.3424 | 39.7772 | 40.2097 | 40.6397 | 41.0671 | 41.4918 |
| zmb | Zambia | 1.0128 | 1.0158 | 1.0189 | 1.0204 | 1.0235 | 1.0250 | 1.0280 | 1.0311 | 1.0326 | ... | 16.5543 | 16.9001 | 17.2503 | 17.6048 | 17.9636 | 18.3265 | 18.6936 | 19.0648 | 19.4398 | 19.8187 |
| zwe | Zimbabwe | 2.1063 | 2.1087 | 2.1111 | 2.1135 | 2.1160 | 2.1184 | 2.1208 | 2.1232 | 2.1257 | ... | 29.4654 | 30.1061 | 30.7563 | 31.4158 | 32.0845 | 32.7624 | 33.4493 | 34.1452 | 34.8497 | 35.5629 |
193 rows × 302 columns
In [4]:
average_daily_income.iloc[0, range(1, 224, 1)]
Out[4]:
1800 1.0208
1801 1.0208
1802 1.0208
1803 1.0208
1804 1.0208
...
2018 5.2825
2019 5.3278
2020 5.041
2021 3.9025
2022 3.6068
Name: afg, Length: 223, dtype: object
In [5]:
# Display basic info about missing values
print('Missing values per column:')
print(life_expectancy.isnull().sum())
Missing values per column:
name 0
1800 10
1801 10
1802 10
1803 10
..
2096 1
2097 1
2098 1
2099 1
2100 1
Length: 302, dtype: int64
Analysis¶
In [6]:
# Histogram of Life Expectancy
year = '2015'
plt.figure(figsize=(8, 5))
sns.histplot(life_expectancy[year], binwidth=5, kde=False, color='lightcoral')
# sns.histplot(average_daily_income['2000'], kde=False, color='lightcoral')
plt.title('Distribution of Life Expectancy in '+year)
plt.xlabel('Life Expectancy')
plt.ylabel('Count')
plt.tight_layout()
plt.show()
In [7]:
# Histogram of Average Daily Income
plt.figure(figsize=(8, 5))
sns.histplot(average_daily_income[year], kde=False, color='firebrick')
plt.title('Distribution of Average Daily Income in '+year)
plt.xlabel('Average Daily Income')
plt.ylabel('Count')
plt.tight_layout()
plt.show()
In [8]:
average_daily_income[year]
Out[8]:
geo
afg 5.4014
ago 6.5517
alb 13.0307
and 61.1771
are 77.2049
...
wsm 11.5848
yem 3.4980
zaf 12.1293
zmb 3.3653
zwe 5.2510
Name: 2015, Length: 193, dtype: float64
In [9]:
df_concat = pd.concat([average_daily_income[year], life_expectancy[year]], axis=1)
df_concat.columns = ['ADI', 'LE']
df_concat = df_concat.dropna() #
x_data = df_concat['ADI']
y_data = df_concat['LE']
# Ensure they have the same index (pandas series created this way share a default index)
# If your series have different indexes, align them first (e.g., using pd.concat or reindex)
# 2. Create the scatter plot using Matplotlib's object-oriented interface
fig, ax = plt.subplots(figsize=(9, 6))
# Use the values from the pandas Series for the scatter plot
ax.scatter(x_data.values, y_data.values, s=60, alpha=0.7, edgecolors="k")
# 3. Set the desired axis to a logarithmic scale
# For a log-linear plot, set the x-axis to 'log'
ax.set_xscale("log")
# The y-axis is linear by default
# 4. Add labels and a title for clarity
ax.set_xlabel('average_daily_income (log scale)')
ax.set_ylabel('life_expectancy (linear scale)')
ax.set_title('Log-Linear Scatter Plot of Average Daily Income vs Life Expectancy in '+year)
ax.grid(True, which="both", ls="--", linewidth=0.5) # Add grid lines
# 5. Display the plot
plt.show()
In [10]:
log_adi = np.log(average_daily_income[year])
df = pd.DataFrame({'ADI_log': log_adi, 'LE': life_expectancy[year]})
# Generate the scatter plot
df.plot.scatter(x='ADI_log', y='LE', title='Scatter Plot of log(Average Daily Income) vs Life Expectancy in '+year)
# Display the plot
plt.show()
Function Fit¶
Look at Life Expectancy vs Average Daily Income. Look at a scatter plot of the raw data and linear and quadratic fits to the data.
In [11]:
df_concat = pd.concat([average_daily_income[year], life_expectancy[year]], axis=1)
df_concat.columns = ['ADI', 'LE']
df_concat = df_concat.dropna()
# Drop rows with missing ADI or LE
#df_clean = df_concat.dropna(subset=['ADI']).copy()
#df_clean = df_clean.dropna(subset=['LE']).copy()
df_clean = df_concat
x_data = np.log(df_clean['ADI'])
y_data = df_clean['LE']
def my_model_function(x, a, b):
return a * x + b
popt, pcov = curve_fit(my_model_function, x_data, y_data)
a_fit, b_fit = popt
print(f"Optimal affine parameters: a={a_fit}, b={b_fit}")
def my_model_function2(x, a, b, c):
return a * x**2 + b * x + c
popt2, pcov2 = curve_fit(my_model_function2, x_data, y_data)
a_fit2, b_fit2, c_fit2 = popt2
print(f"Optimal quadratic parameters: a={a_fit2}, b={b_fit2}, c={c_fit2}")
x_unique = np.sort(x_data.unique())
ig, ax = plt.subplots(figsize=(10, 10))
plt.scatter(x_data, y_data, color='grey', label='Original Data')
plt.plot(x_data, my_model_function(x_data, a_fit, b_fit), color='blue', label=f'Affine Fit')
plt.plot(x_unique, my_model_function2(x_unique, a_fit2, b_fit2, c_fit2), color='orange', label=f'Quadratic Fit')
plt.xlabel('log(Average Daily Income)')
plt.ylabel('Life Expectancy')
plt.title('Linear Polynomial Fit to DataFrame Columns')
plt.legend()
plt.show()
Optimal affine parameters: a=6.098487545060972, b=55.34704345235798 Optimal quadratic parameters: a=-0.9485700160740614, b=11.47292550169573, c=48.73246024913743
Fit a machine learning model¶
In [12]:
from sklearn.neural_network import MLPRegressor
import numpy as np
X = x_data.to_numpy().reshape(-1,1)
y = y_data.to_numpy()
mlpregress = MLPRegressor(solver='sgd',hidden_layer_sizes=(100),activation='tanh',random_state=1)
mlpregress.fit(X,y)
print(f"score: {mlpregress.score(X,y)}")
print("Predictions:")
X_test = np.unique(X).reshape(-1,1)
y_pred = np.c_[X_test,mlpregress.predict(X_test)]
ig, ax = plt.subplots(figsize=(10, 10))
plt.scatter(x_data, y_data, color='grey', label='Original Data')
plt.plot(x_data, my_model_function(x_data, a_fit, b_fit), color='blue', label=f'Affine Fit')
plt.plot(x_unique, my_model_function2(x_unique, a_fit2, b_fit2, c_fit2), color='orange', label=f'Quadratic Fit')
plt.plot(X_test, y_pred[:, 1], color='yellow', label=f'ML')
plt.xlabel('GDP per Capita')
plt.ylabel('Life Expectancy')
plt.title('Fits to DataFrame Columns')
plt.legend()
plt.show()
score: -0.2807243992121897 Predictions:
Probability¶
In [13]:
# prob_data = x_data
prob_data = life_expectancy[year]
npts = prob_data.size
mean = prob_data.mean()
stddev = prob_data.std()
#
# plot histogram and points
#
ig, ax = plt.subplots(figsize=(10, 10))
plt.hist(prob_data,bins=npts//10
,density=True)
plt.plot(prob_data,0*prob_data,'|',ms=npts/20)
#
# plot Gaussian
#
xi = np.linspace(mean-3*stddev,mean+3*stddev,100)
yi = np.exp(-(xi-mean)**2/(2*stddev**2))/np.sqrt(2*np.pi*stddev**2)
plt.plot(xi,yi,'r')
plt.show()
Density Estimation¶
Begining to work on this, more is needed debug cluster size of 2. Note that example code has errors in def plot_kmeans(x,y,mux,muy) and def plot_Voronoi(x,y,mux,muy) where plt.ylim() is defined by x values instead of y.
In [ ]:
In [14]:
from scipy.spatial import Voronoi,voronoi_plot_2d
import time
df_concat = pd.concat([average_daily_income[year], life_expectancy[year]], axis=1)
df_concat.columns = ['ADI', 'LE']
df_clean = df_concat.dropna()
x_data = np.log(df_clean['ADI'])
y_data = df_clean['LE']
#
# k-means parameters
#
nsteps = 1000
momentum = 0.
x = x_data.to_numpy()
y = y_data.to_numpy()
def kmeans(x,y,momentum,nclusters):
#
# choose starting points
#
indices = np.random.uniform(low=0,high=len(x),size=nclusters).astype(int)
mux = x[indices]
muy = y[indices]
#
# do k-means iteration
#
for i in range(nsteps):
#
# find closest points
#
X = np.outer(x,np.ones(len(mux)))
Y = np.outer(y,np.ones(len(muy)))
Mux = np.outer(np.ones(len(x)),mux)
Muy = np.outer(np.ones(len(x)),muy)
distances = np.sqrt((X-Mux)**2+(Y-Muy)**2)
mins = np.argmin(distances,axis=1)
#
# update means
#
for i in range(len(mux)):
index = np.where(mins == i)
mux[i] = np.sum(x[index])/len(index[0])
muy[i] = np.sum(y[index])/len(index[0])
#
# find distances
#
distances = 0
for i in range(len(mux)):
index = np.where(mins == i)
distances += np.sum(np.sqrt((x[index]-mux[i])**2+(y[index]-muy[i])**2))
return mux,muy,distances
def plot_kmeans(x,y,mux,muy):
xmin = np.min(x)
xmax = np.max(x)
ymin = np.min(y)
ymax = np.max(y)
fig,ax = plt.subplots()
plt.plot(x,y,'.')
plt.plot(mux,muy,'r.',markersize=20)
plt.xlim(xmin,xmax)
plt.ylim(ymin,ymax)
plt.title(f"{len(mux)} clusters")
plt.show()
def plot_Voronoi(x,y,mux,muy):
xmin = np.min(x)
xmax = np.max(x)
ymin = np.min(y)
ymax = np.max(y)
fig,ax = plt.subplots()
plt.plot(x,y,'.')
vor = Voronoi(np.stack((mux,muy),axis=1))
voronoi_plot_2d(vor,ax=ax,show_points=True,show_vertices=False,point_size=20)
plt.xlim(xmin,xmax)
plt.ylim(ymin,ymax)
plt.title(f"{len(mux)} clusters")
plt.show()
distances = np.zeros(5)
mux,muy,distances[0] = kmeans(x,y,momentum,1)
plot_kmeans(x,y,mux,muy)
mux,muy,distances[1] = kmeans(x,y,momentum,2)
plot_kmeans(x,y,mux,muy)
mux,muy,distances[2] = kmeans(x,y,momentum,3)
plot_Voronoi(x,y,mux,muy)
mux,muy,distances[3] = kmeans(x,y,momentum,4)
plot_Voronoi(x,y,mux,muy)
mux,muy,distances[4] = kmeans(x,y,momentum,5)
plot_Voronoi(x,y,mux,muy)
fig,ax = plt.subplots()
plt.plot(np.arange(1,6),distances,'o')
plt.xlabel('number of clusters')
plt.ylabel('total distances to clusters')
ax.xaxis.get_major_locator().set_params(integer=True)
plt.show()
In [ ]: