Simple Statistical Analysis Example¶
Here is the basic statistic library sample code using SDGs data.
OK, let's start from reading data
import requests
url = 'https://www.kaggle.com/api/v1/datasets/download/sazidthe1/sustainable-development-report'
r = requests.get(url)
with open('../datasets/sdgsdata.zip','wb') as fd:
fd.write(r.content)
import shutil
shutil.unpack_archive('../datasets/sdgsdata.zip','./datasets/')
path_2023 = "../datasets/sustainable_development_report_2023.csv"
path_index = "../datasets/sdg_index_2000-2022.csv"
# Read sutainable developement report 2023 data
import pandas as pd
df_sdg2023 = pd.read_csv(path_2023)
df_sdg2023
| country_code | country | region | overall_score | goal_1_score | goal_2_score | goal_3_score | goal_4_score | goal_5_score | goal_6_score | ... | goal_8_score | goal_9_score | goal_10_score | goal_11_score | goal_12_score | goal_13_score | goal_14_score | goal_15_score | goal_16_score | goal_17_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | FIN | Finland | OECD | 86.760595 | 99.5750 | 60.886750 | 95.386385 | 97.169333 | 92.11125 | 94.3276 | ... | 86.789000 | 95.995714 | 98.4685 | 91.233750 | 60.059571 | 68.793667 | 87.928000 | 85.0700 | 92.521091 | 75.601000 |
| 1 | SWE | Sweden | OECD | 85.981397 | 98.8885 | 63.074125 | 96.904000 | 99.761667 | 91.44025 | 95.0576 | ... | 84.966429 | 97.586286 | 94.9650 | 90.389250 | 56.830571 | 70.031000 | 69.348667 | 80.1882 | 88.508455 | 85.770250 |
| 2 | DNK | Denmark | OECD | 85.683637 | 99.2155 | 71.025250 | 95.398500 | 99.339667 | 86.99800 | 90.7316 | ... | 87.562429 | 96.984857 | 98.1560 | 93.038500 | 44.571714 | 60.780667 | 76.303333 | 92.7924 | 93.844909 | 82.148000 |
| 3 | DEU | Germany | OECD | 83.358447 | 99.5105 | 72.366000 | 93.039357 | 97.162667 | 81.92025 | 88.4434 | ... | 86.967286 | 95.788429 | 88.1470 | 90.096500 | 55.412857 | 64.002000 | 73.996000 | 79.2318 | 89.457545 | 84.390250 |
| 4 | AUT | Austria | OECD | 82.280189 | 99.4510 | 73.067500 | 92.468000 | 97.914333 | 84.57925 | 92.1636 | ... | 83.274143 | 96.982143 | 94.6345 | 92.473667 | 49.623286 | 57.332000 | NaN | 73.5836 | 87.911455 | 71.130250 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 161 | SOM | Somalia | Sub-Saharan Africa | 48.027231 | 11.2740 | 27.306833 | 17.860923 | 55.639000 | 25.86100 | 49.3006 | ... | 55.669500 | 5.599857 | 73.8030 | 69.417667 | 94.129000 | 99.925333 | 50.251200 | 53.3714 | 40.012200 | 43.725667 |
| 162 | YEM | Yemen, Rep. | MENA | 46.846980 | 4.5525 | 28.769714 | 44.467429 | 41.766750 | 13.05475 | 36.2314 | ... | 53.237800 | 14.223714 | 66.2925 | 52.952750 | 95.959143 | 98.667000 | 74.936667 | 48.3705 | 35.905714 | 50.094333 |
| 163 | TCD | Chad | Sub-Saharan Africa | 45.342321 | 25.4270 | 38.534714 | 27.061071 | 13.307750 | 30.83550 | 42.4036 | ... | 64.424333 | 9.631571 | 63.0405 | 32.822250 | 90.994167 | 99.079000 | NaN | 76.1944 | 29.438000 | 52.594000 |
| 164 | CAF | Central African Republic | Sub-Saharan Africa | 40.395839 | 3.1820 | 36.468000 | 12.952714 | 19.305750 | 34.20075 | 40.4204 | ... | 53.382333 | 7.064714 | 9.5775 | 21.898667 | 94.462143 | 99.463333 | NaN | 89.7172 | 42.332667 | 36.516750 |
| 165 | SSD | South Sudan | Sub-Saharan Africa | 38.676086 | 0.0000 | 19.805800 | 23.861714 | 1.232250 | 55.98875 | 41.0406 | ... | 50.917000 | 1.654833 | 26.6195 | 13.826250 | 90.960000 | 99.408000 | NaN | 74.6870 | 38.141167 | 41.622000 |
166 rows × 21 columns
# Drop 'region' column
df_sdg2023 = df_sdg2023.drop(columns=['region'])
# rename 'overall_score' to 'sdg_index_score'
sdgsdata2023 = df_sdg2023.rename(columns={'overall_score':'sdg_index_score'})
# add 'year' column and insert same value '2023'
sdgsdata2023['year'] = 2023
sdgsdata2023
| country_code | country | sdg_index_score | goal_1_score | goal_2_score | goal_3_score | goal_4_score | goal_5_score | goal_6_score | goal_7_score | ... | goal_9_score | goal_10_score | goal_11_score | goal_12_score | goal_13_score | goal_14_score | goal_15_score | goal_16_score | goal_17_score | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | FIN | Finland | 86.760595 | 99.5750 | 60.886750 | 95.386385 | 97.169333 | 92.11125 | 94.3276 | 93.01350 | ... | 95.995714 | 98.4685 | 91.233750 | 60.059571 | 68.793667 | 87.928000 | 85.0700 | 92.521091 | 75.601000 | 2023 |
| 1 | SWE | Sweden | 85.981397 | 98.8885 | 63.074125 | 96.904000 | 99.761667 | 91.44025 | 95.0576 | 97.97350 | ... | 97.586286 | 94.9650 | 90.389250 | 56.830571 | 70.031000 | 69.348667 | 80.1882 | 88.508455 | 85.770250 | 2023 |
| 2 | DNK | Denmark | 85.683637 | 99.2155 | 71.025250 | 95.398500 | 99.339667 | 86.99800 | 90.7316 | 87.73050 | ... | 96.984857 | 98.1560 | 93.038500 | 44.571714 | 60.780667 | 76.303333 | 92.7924 | 93.844909 | 82.148000 | 2023 |
| 3 | DEU | Germany | 83.358447 | 99.5105 | 72.366000 | 93.039357 | 97.162667 | 81.92025 | 88.4434 | 77.16175 | ... | 95.788429 | 88.1470 | 90.096500 | 55.412857 | 64.002000 | 73.996000 | 79.2318 | 89.457545 | 84.390250 | 2023 |
| 4 | AUT | Austria | 82.280189 | 99.4510 | 73.067500 | 92.468000 | 97.914333 | 84.57925 | 92.1636 | 86.03650 | ... | 96.982143 | 94.6345 | 92.473667 | 49.623286 | 57.332000 | NaN | 73.5836 | 87.911455 | 71.130250 | 2023 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 161 | SOM | Somalia | 48.027231 | 11.2740 | 27.306833 | 17.860923 | 55.639000 | 25.86100 | 49.3006 | 43.31575 | ... | 5.599857 | 73.8030 | 69.417667 | 94.129000 | 99.925333 | 50.251200 | 53.3714 | 40.012200 | 43.725667 | 2023 |
| 162 | YEM | Yemen, Rep. | 46.846980 | 4.5525 | 28.769714 | 44.467429 | 41.766750 | 13.05475 | 36.2314 | 36.91600 | ... | 14.223714 | 66.2925 | 52.952750 | 95.959143 | 98.667000 | 74.936667 | 48.3705 | 35.905714 | 50.094333 | 2023 |
| 163 | TCD | Chad | 45.342321 | 25.4270 | 38.534714 | 27.061071 | 13.307750 | 30.83550 | 42.4036 | 8.69700 | ... | 9.631571 | 63.0405 | 32.822250 | 90.994167 | 99.079000 | NaN | 76.1944 | 29.438000 | 52.594000 | 2023 |
| 164 | CAF | Central African Republic | 40.395839 | 3.1820 | 36.468000 | 12.952714 | 19.305750 | 34.20075 | 40.4204 | 19.44975 | ... | 7.064714 | 9.5775 | 21.898667 | 94.462143 | 99.463333 | NaN | 89.7172 | 42.332667 | 36.516750 | 2023 |
| 165 | SSD | South Sudan | 38.676086 | 0.0000 | 19.805800 | 23.861714 | 1.232250 | 55.98875 | 41.0406 | 11.39400 | ... | 1.654833 | 26.6195 | 13.826250 | 90.960000 | 99.408000 | NaN | 74.6870 | 38.141167 | 41.622000 | 2023 |
166 rows × 21 columns
# Read SDG index data from 2000-2022
df_index = pd.read_csv(path_index)
df_index
| country_code | country | year | sdg_index_score | goal_1_score | goal_2_score | goal_3_score | goal_4_score | goal_5_score | goal_6_score | ... | goal_8_score | goal_9_score | goal_10_score | goal_11_score | goal_12_score | goal_13_score | goal_14_score | goal_15_score | goal_16_score | goal_17_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Afghanistan | 2000 | 36.0 | 28.8 | 27.3 | 19.2 | 1.6 | 20.8 | 32.4 | ... | 38.5 | 5.2 | 0.0 | 25.8 | 94.7 | 99.4 | 0.0 | 51.9 | 39.2 | 34.2 |
| 1 | AFG | Afghanistan | 2001 | 36.3 | 28.8 | 30.6 | 19.4 | 1.6 | 20.8 | 32.4 | ... | 38.5 | 5.2 | 0.0 | 25.8 | 94.5 | 99.4 | 0.0 | 51.9 | 39.2 | 34.2 |
| 2 | AFG | Afghanistan | 2002 | 36.3 | 28.8 | 30.7 | 19.7 | 1.6 | 20.8 | 32.7 | ... | 38.4 | 5.2 | 0.0 | 26.1 | 94.1 | 99.4 | 0.0 | 51.8 | 39.2 | 34.2 |
| 3 | AFG | Afghanistan | 2003 | 36.7 | 28.8 | 32.5 | 19.9 | 1.6 | 20.8 | 33.0 | ... | 38.4 | 5.2 | 0.0 | 26.5 | 94.4 | 99.4 | 0.0 | 51.8 | 39.2 | 34.2 |
| 4 | AFG | Afghanistan | 2004 | 37.1 | 28.8 | 32.1 | 21.1 | 1.6 | 20.8 | 33.3 | ... | 38.5 | 5.2 | 0.0 | 26.8 | 94.8 | 99.4 | 0.0 | 51.8 | 39.2 | 34.2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4135 | ZWE | Zimbabwe | 2018 | 55.9 | 26.4 | 46.6 | 39.8 | 57.9 | 76.7 | 51.6 | ... | 63.8 | 23.5 | 32.4 | 79.2 | 94.6 | 94.1 | 0.0 | 76.4 | 45.5 | 45.6 |
| 4136 | ZWE | Zimbabwe | 2019 | 53.7 | 21.4 | 46.0 | 40.6 | 60.6 | 77.0 | 51.3 | ... | 62.8 | 23.8 | 24.0 | 58.6 | 94.6 | 94.2 | 0.0 | 76.4 | 44.0 | 44.5 |
| 4137 | ZWE | Zimbabwe | 2020 | 54.3 | 20.1 | 47.2 | 41.5 | 62.0 | 77.0 | 51.1 | ... | 62.9 | 25.4 | 24.0 | 60.0 | 94.6 | 94.3 | 0.0 | 76.4 | 44.6 | 46.0 |
| 4138 | ZWE | Zimbabwe | 2021 | 54.8 | 20.8 | 48.4 | 41.9 | 62.5 | 77.1 | 51.1 | ... | 63.1 | 27.0 | 24.0 | 61.2 | 94.5 | 94.3 | 0.0 | 77.9 | 43.1 | 48.1 |
| 4139 | ZWE | Zimbabwe | 2022 | 55.6 | 21.5 | 48.5 | 41.9 | 62.5 | 77.2 | 51.1 | ... | 63.2 | 27.0 | 24.0 | 65.7 | 94.5 | 94.3 | 0.0 | 77.9 | 40.5 | 47.8 |
4140 rows × 21 columns
# reindex sdgsdaga2023 with df_index
sdgsdata2023.reindex(df_index.index)
# Integrate both dataframe
sdgs_integrated = pd.concat([df_index,sdgsdata2023])
# fillin NaN as 0
sdgs_integrated = sdgs_integrated.fillna(0)
# Check the data with filtering the country name
sdgs_integrated.query('country == "Afghanistan"')
| country_code | country | year | sdg_index_score | goal_1_score | goal_2_score | goal_3_score | goal_4_score | goal_5_score | goal_6_score | ... | goal_8_score | goal_9_score | goal_10_score | goal_11_score | goal_12_score | goal_13_score | goal_14_score | goal_15_score | goal_16_score | goal_17_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Afghanistan | 2000 | 36.000000 | 28.8000 | 27.300000 | 19.200000 | 1.600 | 20.80000 | 32.400 | ... | 38.500 | 5.200000 | 0.0 | 25.800 | 94.700000 | 99.400000 | 0.0 | 51.9000 | 39.200000 | 34.200 |
| 1 | AFG | Afghanistan | 2001 | 36.300000 | 28.8000 | 30.600000 | 19.400000 | 1.600 | 20.80000 | 32.400 | ... | 38.500 | 5.200000 | 0.0 | 25.800 | 94.500000 | 99.400000 | 0.0 | 51.9000 | 39.200000 | 34.200 |
| 2 | AFG | Afghanistan | 2002 | 36.300000 | 28.8000 | 30.700000 | 19.700000 | 1.600 | 20.80000 | 32.700 | ... | 38.400 | 5.200000 | 0.0 | 26.100 | 94.100000 | 99.400000 | 0.0 | 51.8000 | 39.200000 | 34.200 |
| 3 | AFG | Afghanistan | 2003 | 36.700000 | 28.8000 | 32.500000 | 19.900000 | 1.600 | 20.80000 | 33.000 | ... | 38.400 | 5.200000 | 0.0 | 26.500 | 94.400000 | 99.400000 | 0.0 | 51.8000 | 39.200000 | 34.200 |
| 4 | AFG | Afghanistan | 2004 | 37.100000 | 28.8000 | 32.100000 | 21.100000 | 1.600 | 20.80000 | 33.300 | ... | 38.500 | 5.200000 | 0.0 | 26.800 | 94.800000 | 99.400000 | 0.0 | 51.8000 | 39.200000 | 34.200 |
| 5 | AFG | Afghanistan | 2005 | 37.500000 | 28.8000 | 35.900000 | 22.600000 | 1.600 | 20.80000 | 33.600 | ... | 38.400 | 5.200000 | 0.0 | 26.900 | 95.700000 | 99.400000 | 0.0 | 51.8000 | 39.200000 | 34.200 |
| 6 | AFG | Afghanistan | 2006 | 37.600000 | 28.8000 | 36.500000 | 22.700000 | 1.600 | 20.80000 | 33.900 | ... | 38.500 | 5.200000 | 0.0 | 27.300 | 95.900000 | 99.300000 | 0.0 | 51.8000 | 39.200000 | 34.700 |
| 7 | AFG | Afghanistan | 2007 | 38.000000 | 28.8000 | 39.500000 | 24.400000 | 1.600 | 21.00000 | 34.600 | ... | 38.500 | 5.200000 | 0.0 | 27.600 | 96.200000 | 99.300000 | 0.0 | 51.8000 | 39.200000 | 34.800 |
| 8 | AFG | Afghanistan | 2008 | 37.300000 | 28.8000 | 37.800000 | 25.900000 | 1.600 | 21.00000 | 35.700 | ... | 38.500 | 5.200000 | 0.0 | 28.000 | 96.300000 | 99.200000 | 0.0 | 51.8000 | 39.200000 | 36.000 |
| 9 | AFG | Afghanistan | 2009 | 38.300000 | 28.8000 | 43.000000 | 28.100000 | 1.600 | 20.80000 | 36.900 | ... | 38.500 | 5.400000 | 0.0 | 30.300 | 96.600000 | 99.100000 | 0.0 | 53.0000 | 39.900000 | 36.700 |
| 10 | AFG | Afghanistan | 2010 | 38.800000 | 28.8000 | 43.200000 | 31.400000 | 1.600 | 21.00000 | 38.000 | ... | 38.500 | 5.500000 | 0.0 | 29.900 | 96.300000 | 99.000000 | 0.0 | 53.0000 | 43.800000 | 36.600 |
| 11 | AFG | Afghanistan | 2011 | 38.400000 | 27.9000 | 42.000000 | 30.600000 | 1.600 | 21.00000 | 39.200 | ... | 38.500 | 5.600000 | 0.0 | 25.900 | 96.000000 | 98.800000 | 0.0 | 53.0000 | 43.000000 | 36.900 |
| 12 | AFG | Afghanistan | 2012 | 39.700000 | 30.5000 | 44.300000 | 30.400000 | 1.600 | 21.00000 | 40.400 | ... | 38.500 | 6.500000 | 0.0 | 31.400 | 96.100000 | 98.900000 | 0.0 | 53.0000 | 43.500000 | 35.300 |
| 13 | AFG | Afghanistan | 2013 | 39.700000 | 31.0000 | 44.200000 | 29.600000 | 1.600 | 21.30000 | 41.600 | ... | 38.500 | 6.600000 | 0.0 | 28.900 | 96.000000 | 99.000000 | 0.0 | 53.0000 | 44.300000 | 35.600 |
| 14 | AFG | Afghanistan | 2014 | 40.800000 | 30.7000 | 44.300000 | 30.900000 | 1.600 | 21.80000 | 42.900 | ... | 38.700 | 5.800000 | 0.0 | 33.600 | 96.300000 | 99.100000 | 0.0 | 53.0000 | 45.300000 | 36.000 |
| 15 | AFG | Afghanistan | 2015 | 41.600000 | 30.1000 | 45.000000 | 33.400000 | 17.500 | 22.40000 | 44.100 | ... | 39.100 | 6.400000 | 0.0 | 31.300 | 96.600000 | 99.100000 | 0.0 | 53.0000 | 44.300000 | 35.500 |
| 16 | AFG | Afghanistan | 2016 | 44.700000 | 29.8000 | 44.300000 | 34.400000 | 37.400 | 24.80000 | 45.400 | ... | 38.500 | 8.000000 | 0.0 | 36.000 | 96.500000 | 99.200000 | 0.0 | 56.5000 | 45.700000 | 38.200 |
| 17 | AFG | Afghanistan | 2017 | 45.600000 | 29.6000 | 43.500000 | 35.100000 | 38.900 | 27.10000 | 46.600 | ... | 39.900 | 8.800000 | 0.0 | 33.600 | 96.600000 | 99.100000 | 0.0 | 60.9000 | 45.500000 | 40.100 |
| 18 | AFG | Afghanistan | 2018 | 46.300000 | 29.1000 | 47.800000 | 36.900000 | 40.200 | 29.00000 | 47.900 | ... | 40.800 | 9.500000 | 0.0 | 32.800 | 96.700000 | 99.000000 | 0.0 | 60.9000 | 45.600000 | 42.100 |
| 19 | AFG | Afghanistan | 2019 | 46.400000 | 29.4000 | 47.000000 | 36.500000 | 42.000 | 30.80000 | 49.200 | ... | 40.600 | 9.700000 | 0.0 | 29.800 | 96.700000 | 99.000000 | 0.0 | 60.9000 | 46.100000 | 42.000 |
| 20 | AFG | Afghanistan | 2020 | 47.300000 | 27.9000 | 46.000000 | 37.600000 | 42.000 | 30.20000 | 50.400 | ... | 41.100 | 9.800000 | 0.0 | 30.200 | 96.700000 | 99.000000 | 0.0 | 73.2000 | 45.300000 | 44.100 |
| 21 | AFG | Afghanistan | 2021 | 46.300000 | 20.0000 | 46.700000 | 37.500000 | 34.400 | 32.90000 | 50.400 | ... | 39.500 | 9.700000 | 0.0 | 29.400 | 96.700000 | 99.000000 | 0.0 | 73.2000 | 44.600000 | 45.300 |
| 22 | AFG | Afghanistan | 2022 | 49.000000 | 12.4000 | 46.900000 | 37.500000 | 34.400 | 32.90000 | 50.400 | ... | 39.500 | 9.700000 | 0.0 | 24.500 | 96.700000 | 99.000000 | 0.0 | 73.2000 | 44.800000 | 45.300 |
| 157 | AFG | Afghanistan | 2023 | 49.005601 | 12.4105 | 46.915143 | 37.486714 | 34.366 | 32.92625 | 50.426 | ... | 39.535 | 9.724571 | 0.0 | 24.494 | 96.679857 | 99.034333 | 0.0 | 73.2382 | 44.796182 | 45.289 |
24 rows × 21 columns
import matplotlib.pyplot as plt
goal_def = [
{'color':'#0000AA','description':'SDGs Total Index'} , # total index
{'color':'#E5233D','description':'No Poverty'}, # Goal 1
{'color':'#DDA73A','description':'Zero Hunger'}, # Goal 2
{'color':'#4CA146','description':'Good Health and Well-being'}, # Goal 3
{'color':'#C5192D','description':'Quality Education'}, # Goal 4
{'color':'#FF3A21','description':'Gender Equality'}, # Goal 5
{'color':'#26BDE2','description':'Clean Water and sanitation'}, # Goal 6
{'color':'#FCC30B','description':'Affordable and Clean Energy'}, # Goal 7
{'color':'#A21942','description':'Decide Work And Economic Growth'}, # Goal 8
{'color':'#FD6925','description':'Industry, Innovation AND Infrastructure'}, # Goal 9
{'color':'#DD1367','description':'Reduce Inequality'}, # Goal 10
{'color':'#FD9D24','description':'Sustainable City and Community'},# Goal 11
{'color':'#BF8B2E','description':'Responsible Comsumption and Production'}, # Goal 12
{'color':'#3F7E44','description':'Climate Action'}, # Goal 13
{'color':'#0A97D9','description':'Life Below Water'}, # Goal 14
{'color':'#56C02B','description':'Life On Land'}, # Goal 15
{'color':'#00689D','description':'Peace, Justice and Strong Institution'}, # Goal 16
{'color':'#19486A','description':'Partnership for the Goal'} # Goal 17
]
columns = sdgs_integrated.columns
sdgs_labels = columns.to_list()
sdgs_labels.remove('country_code')
sdgs_labels.remove('year')
sdgs_labels.remove('country')
sdgs_def = []
for i,value in enumerate(sdgs_labels):
sdgs_def.append({'label':value,'color':goal_def[i]['color'],'description':goal_def[i]['description']})
sdgs_def
[{'label': 'sdg_index_score',
'color': '#0000AA',
'description': 'SDGs Total Index'},
{'label': 'goal_1_score', 'color': '#E5233D', 'description': 'No Poverty'},
{'label': 'goal_2_score', 'color': '#DDA73A', 'description': 'Zero Hunger'},
{'label': 'goal_3_score',
'color': '#4CA146',
'description': 'Good Health and Well-being'},
{'label': 'goal_4_score',
'color': '#C5192D',
'description': 'Quality Education'},
{'label': 'goal_5_score',
'color': '#FF3A21',
'description': 'Gender Equality'},
{'label': 'goal_6_score',
'color': '#26BDE2',
'description': 'Clean Water and sanitation'},
{'label': 'goal_7_score',
'color': '#FCC30B',
'description': 'Affordable and Clean Energy'},
{'label': 'goal_8_score',
'color': '#A21942',
'description': 'Decide Work And Economic Growth'},
{'label': 'goal_9_score',
'color': '#FD6925',
'description': 'Industry, Innovation AND Infrastructure'},
{'label': 'goal_10_score',
'color': '#DD1367',
'description': 'Reduce Inequality'},
{'label': 'goal_11_score',
'color': '#FD9D24',
'description': 'Sustainable City and Community'},
{'label': 'goal_12_score',
'color': '#BF8B2E',
'description': 'Responsible Comsumption and Production'},
{'label': 'goal_13_score',
'color': '#3F7E44',
'description': 'Climate Action'},
{'label': 'goal_14_score',
'color': '#0A97D9',
'description': 'Life Below Water'},
{'label': 'goal_15_score', 'color': '#56C02B', 'description': 'Life On Land'},
{'label': 'goal_16_score',
'color': '#00689D',
'description': 'Peace, Justice and Strong Institution'},
{'label': 'goal_17_score',
'color': '#19486A',
'description': 'Partnership for the Goal'}]
statdf = sdgs_integrated
goal_cols = [f"goal_{i}_score" for i in range(1,18)]
goal_cols
['goal_1_score', 'goal_2_score', 'goal_3_score', 'goal_4_score', 'goal_5_score', 'goal_6_score', 'goal_7_score', 'goal_8_score', 'goal_9_score', 'goal_10_score', 'goal_11_score', 'goal_12_score', 'goal_13_score', 'goal_14_score', 'goal_15_score', 'goal_16_score', 'goal_17_score']
OK, let's start from linear regression with OLS(Ordinary Least Square) with setting "sdg_index_score" as dependency variable.
This time, I use the library of statemodels
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
Then, set goal_1_index ~ goal_17_index as explanatory variables "X", and set sdg_goal_index as response variable "y"
X = statdf[goal_cols].copy()
y = statdf["sdg_index_score"]
X.shape,y.shape
((4306, 17), (4306,))
OK, use OLS in statsmodel library and get the result.
X_const = sm.add_constant(X)
model = sm.OLS(y,X_const).fit()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: sdg_index_score R-squared: 0.989
Model: OLS Adj. R-squared: 0.989
Method: Least Squares F-statistic: 2.305e+04
Date: Sun, 30 Nov 2025 Prob (F-statistic): 0.00
Time: 05:05:59 Log-Likelihood: -6604.4
No. Observations: 4306 AIC: 1.324e+04
Df Residuals: 4288 BIC: 1.336e+04
Df Model: 17
Covariance Type: nonrobust
=================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------
const 4.4003 0.316 13.905 0.000 3.780 5.021
goal_1_score 0.0136 0.001 18.516 0.000 0.012 0.015
goal_2_score 0.0774 0.002 34.007 0.000 0.073 0.082
goal_3_score 0.1071 0.002 44.693 0.000 0.102 0.112
goal_4_score 0.0691 0.001 49.720 0.000 0.066 0.072
goal_5_score 0.0396 0.002 24.283 0.000 0.036 0.043
goal_6_score 0.0555 0.002 24.770 0.000 0.051 0.060
goal_7_score 0.0809 0.002 45.247 0.000 0.077 0.084
goal_8_score 0.0535 0.003 19.538 0.000 0.048 0.059
goal_9_score 0.0393 0.001 26.939 0.000 0.036 0.042
goal_10_score 0.0462 0.001 66.507 0.000 0.045 0.048
goal_11_score 0.0574 0.002 29.641 0.000 0.054 0.061
goal_12_score 0.0627 0.003 22.134 0.000 0.057 0.068
goal_13_score 0.0298 0.002 18.926 0.000 0.027 0.033
goal_14_score 0.0071 0.001 10.886 0.000 0.006 0.008
goal_15_score 0.0711 0.001 49.270 0.000 0.068 0.074
goal_16_score 0.0637 0.002 28.688 0.000 0.059 0.068
goal_17_score 0.0677 0.002 40.144 0.000 0.064 0.071
==============================================================================
Omnibus: 442.196 Durbin-Watson: 0.204
Prob(Omnibus): 0.000 Jarque-Bera (JB): 689.748
Skew: 0.752 Prob(JB): 1.67e-150
Kurtosis: 4.257 Cond. No. 4.88e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.88e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
All goal scores p-value is p<0.05, so it has a statistically significant effect.
Looking at the “coef” values, it suggests that Goal 3 has the highest contribution (an increase in Goal 3's score leads to the largest increase in the overall SDG Index score). Goal 12 comes next, followed by Goal 7.
Also, checked Variance Inflation Factor (VIF).
import numpy as np
X_vif = X_const.values
vif_list = []
for i in range(X_vif.shape[1]):
vif = variance_inflation_factor(X_vif,i)
vif_list.append(vif)
vif_df = pd.DataFrame({
"feature": X_const.columns,
"VIF": vif_list
})
vif_df
| feature | VIF | |
|---|---|---|
| 0 | const | 341.326363 |
| 1 | goal_1_score | 2.394855 |
| 2 | goal_2_score | 2.178459 |
| 3 | goal_3_score | 9.727456 |
| 4 | goal_4_score | 4.602000 |
| 5 | goal_5_score | 2.716097 |
| 6 | goal_6_score | 4.020126 |
| 7 | goal_7_score | 5.013481 |
| 8 | goal_8_score | 2.571580 |
| 9 | goal_9_score | 4.912090 |
| 10 | goal_10_score | 1.612300 |
| 11 | goal_11_score | 4.214480 |
| 12 | goal_12_score | 7.622557 |
| 13 | goal_13_score | 3.843774 |
| 14 | goal_14_score | 1.156706 |
| 15 | goal_15_score | 1.283671 |
| 16 | goal_16_score | 3.579412 |
| 17 | goal_17_score | 1.651957 |
Goal 3 is also high VIF value here....
coef = model.params.drop("const")
coef_sorted = coef.sort_values(ascending=False)
coef_sorted
goal_3_score 0.107058 goal_7_score 0.080873 goal_2_score 0.077405 goal_15_score 0.071142 goal_4_score 0.069079 goal_17_score 0.067698 goal_16_score 0.063665 goal_12_score 0.062666 goal_11_score 0.057354 goal_6_score 0.055526 goal_8_score 0.053488 goal_10_score 0.046224 goal_5_score 0.039615 goal_9_score 0.039253 goal_13_score 0.029791 goal_1_score 0.013587 goal_14_score 0.007143 dtype: float64
Moreover... checked the correlation with Pearson correlation coefficient,
from scipy.stats import pearsonr
pearsonr_results = {}
for col in goal_cols:
rho,pval = pearsonr(statdf[col],statdf["sdg_index_score"])
pearsonr_results[col] = {"rho": rho, "p-value": pval}
pr_df = pd.DataFrame(pearsonr_results).T
pr_df.sort_values("rho",ascending=False)
pr_df
| rho | p-value | |
|---|---|---|
| goal_1_score | 0.720406 | 0.000000e+00 |
| goal_2_score | 0.680498 | 0.000000e+00 |
| goal_3_score | 0.915858 | 0.000000e+00 |
| goal_4_score | 0.862357 | 0.000000e+00 |
| goal_5_score | 0.713507 | 0.000000e+00 |
| goal_6_score | 0.862810 | 0.000000e+00 |
| goal_7_score | 0.877140 | 0.000000e+00 |
| goal_8_score | 0.734806 | 0.000000e+00 |
| goal_9_score | 0.816256 | 0.000000e+00 |
| goal_10_score | 0.441650 | 4.452731e-205 |
| goal_11_score | 0.835802 | 0.000000e+00 |
| goal_12_score | -0.693377 | 0.000000e+00 |
| goal_13_score | -0.485065 | 4.859978e-253 |
| goal_14_score | 0.151990 | 1.136253e-23 |
| goal_15_score | 0.252971 | 7.443630e-64 |
| goal_16_score | 0.761106 | 0.000000e+00 |
| goal_17_score | 0.603774 | 0.000000e+00 |
I also checked the correlation with Spearman's rank correlation coefficient
from scipy.stats import spearmanr
spearman_results = {}
for col in goal_cols:
rho,pval = spearmanr(statdf[col],statdf["sdg_index_score"])
spearman_results[col] = {"rho": rho, "p-value": pval}
sp_df = pd.DataFrame(spearman_results).T
sp_df.sort_values("rho",ascending=False)
sp_df
| rho | p-value | |
|---|---|---|
| goal_1_score | 0.766965 | 0.000000e+00 |
| goal_2_score | 0.669205 | 0.000000e+00 |
| goal_3_score | 0.902713 | 0.000000e+00 |
| goal_4_score | 0.853403 | 0.000000e+00 |
| goal_5_score | 0.701933 | 0.000000e+00 |
| goal_6_score | 0.873589 | 0.000000e+00 |
| goal_7_score | 0.862647 | 0.000000e+00 |
| goal_8_score | 0.743441 | 0.000000e+00 |
| goal_9_score | 0.859014 | 0.000000e+00 |
| goal_10_score | 0.493456 | 3.889951e-263 |
| goal_11_score | 0.820494 | 0.000000e+00 |
| goal_12_score | -0.776105 | 0.000000e+00 |
| goal_13_score | -0.736678 | 0.000000e+00 |
| goal_14_score | 0.102578 | 1.509305e-11 |
| goal_15_score | 0.260884 | 6.161587e-68 |
| goal_16_score | 0.763935 | 0.000000e+00 |
| goal_17_score | 0.618478 | 0.000000e+00 |
Then, visualized above results as follow.
Here is the bar chart of Effect of each SDG Goal on SDG index from the result of OLS Coefficients).
coef_sorted = model.params.drop("const").sort_values()
plt.figure(figsize=(10,6))
coef_sorted.plot(kind="barh")
plt.title("Effect of Each SDG Goal on SDG Index (OLS Coefficients)")
plt.xlabel("Coefficient")
plt.grid(axis="x")
plt.show()
Here is the bar chart of VIF result
plt.figure(figsize=(10,5))
plt.bar(vif_df["feature"],vif_df["VIF"])
plt.xticks(rotation=90)
plt.ylabel("VIF")
plt.title("Variance Inflation Factor (Multicollinearity)")
plt.grid(axis="y")
plt.tight_layout()
plt.show()
Here is a heat map of Correlation Between SDG Goals with Pearson correlation coefficient.
import seaborn as sns
plt.figure(figsize=(10,8))
sns.heatmap(statdf[goal_cols].corr(), annot=False, cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Correlation Between SDG Goals")
plt.show()
Here is the heatmap of Spearman Correlation.
import seaborn as sns
import matplotlib.pyplot as plt
spearman_matrix = statdf[goal_cols + ["sdg_index_score"]].corr(method="spearman")
plt.figure(figsize=(12,8))
sns.heatmap(spearman_matrix, cmap="coolwarm", annot=False, vmin=-1, vmax=1)
plt.title("Spearman Correlation Between SDG Goals and SDG Index")
plt.show()
import numpy as np
y_pred = model.predict(X_const)
plt.figure(figsize=(6,6))
plt.scatter(y, y_pred, alpha=0.3)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--') # 45-degree line
plt.xlabel("Actual SDG Index")
plt.ylabel("Predicted SDG Index")
plt.title("Predicted vs Actual SDG Index (OLS)")
plt.grid(True)
plt.show()
residuals = model.resid
fitted = model.fittedvalues
plt.figure(figsize=(8,5))
plt.scatter(fitted, residuals, alpha=0.3)
plt.axhline(0, color="red")
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")
plt.title("Residual Plot (OLS)")
plt.grid(True)
plt.show()