< Home
Assignment 02¶
- Visualize your data set(s)
Prepare the environment in Jupyter¶
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
path = "datasets/Enaho01A-2025-300.csv"
df = pd.read_csv(path, encoding="latin-1")
In [3]:
df.shape
df.columns[:20]
Out[3]:
Index(['AÑO', 'MES', 'CONGLOME', 'VIVIENDA', 'HOGAR', 'CODPERSO', 'UBIGEO',
'DOMINIO', 'ESTRATO', 'CODINFOR', 'P300N', 'P300I', 'P300A', 'P301A',
'P301B', 'P301C', 'P301D', 'P301A0', 'P301A1', 'P301B0'],
dtype='object')
In [4]:
df.head()
Out[4]:
| AÑO | MES | CONGLOME | VIVIENDA | HOGAR | CODPERSO | UBIGEO | DOMINIO | ESTRATO | CODINFOR | ... | I311D$2 | I311D$3 | I311D$4 | I311D$5 | I311D$6 | I311D$7 | I3121C | I3122C | I315B | FACTOR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2025 | 1 | 15009 | 12 | 11 | 1 | 10101 | 4 | 4 | 1 | ... | 23 | 333.512304 | ||||||||
| 1 | 2025 | 1 | 15009 | 12 | 11 | 2 | 10101 | 4 | 4 | 2 | ... | 380.999876 | |||||||||
| 2 | 2025 | 1 | 15009 | 25 | 11 | 1 | 10101 | 4 | 4 | 1 | ... | 495.705947 | |||||||||
| 3 | 2025 | 1 | 15009 | 25 | 11 | 2 | 10101 | 4 | 4 | 2 | ... | 426.265173 | |||||||||
| 4 | 2025 | 1 | 15009 | 25 | 11 | 3 | 10101 | 4 | 4 | 3 | ... | 380.999876 |
5 rows × 491 columns
In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 24070 entries, 0 to 24069 Columns: 491 entries, AÑO to FACTOR dtypes: float64(1), int64(19), object(471) memory usage: 90.2+ MB
In [6]:
df.describe()
Out[6]:
| AÑO | MES | CONGLOME | VIVIENDA | HOGAR | CODPERSO | UBIGEO | DOMINIO | ESTRATO | CODINFOR | P300N | P300I | P300A | P301A | P203 | P204 | P207 | P208A | TICUEST01A | FACTOR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 24070.0 | 24070.000000 | 24070.000000 | 24070.000000 | 24070.000000 | 24070.000000 | 24070.000000 | 24070.00000 | 24070.000000 | 24070.000000 | 24070.000000 | 24070.000000 | 24070.000000 | 24070.000000 | 24070.000000 | 24070.000000 | 24070.000000 | 24070.000000 | 24070.0 | 24070.000000 |
| mean | 2025.0 | 2.006606 | 17766.863980 | 79.493062 | 11.150519 | 2.470710 | 130352.322642 | 4.85646 | 4.194267 | 1.902700 | 2.470710 | 1.902700 | 5.986207 | 7.521978 | 2.504279 | 1.004030 | 1.514666 | 37.076651 | 2.0 | 1402.106879 |
| std | 0.0 | 0.820961 | 1627.265031 | 73.467597 | 1.404687 | 1.511061 | 67535.209034 | 2.41155 | 2.436749 | 1.208535 | 1.511061 | 1.208535 | 14.909601 | 14.787570 | 1.699042 | 0.063355 | 0.499795 | 22.669416 | 0.0 | 1296.668961 |
| min | 2025.0 | 1.000000 | 15005.000000 | 1.000000 | 11.000000 | 1.000000 | 10101.000000 | 1.00000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 3.000000 | 2.0 | 39.102612 |
| 25% | 2025.0 | 1.000000 | 16389.000000 | 31.000000 | 11.000000 | 1.000000 | 70106.000000 | 2.00000 | 2.000000 | 1.000000 | 1.000000 | 1.000000 | 4.000000 | 3.000000 | 1.000000 | 1.000000 | 1.000000 | 16.000000 | 2.0 | 629.746563 |
| 50% | 2025.0 | 2.000000 | 17697.000000 | 66.000000 | 11.000000 | 2.000000 | 140102.000000 | 5.00000 | 4.000000 | 2.000000 | 2.000000 | 2.000000 | 4.000000 | 5.000000 | 2.000000 | 1.000000 | 2.000000 | 35.000000 | 2.0 | 977.814978 |
| 75% | 2025.0 | 3.000000 | 19176.000000 | 107.000000 | 11.000000 | 3.000000 | 180106.000000 | 7.00000 | 7.000000 | 2.000000 | 3.000000 | 2.000000 | 4.000000 | 6.000000 | 3.000000 | 1.000000 | 2.000000 | 55.000000 | 2.0 | 1640.860425 |
| max | 2025.0 | 3.000000 | 20804.000000 | 871.000000 | 45.000000 | 13.000000 | 250304.000000 | 8.00000 | 8.000000 | 12.000000 | 13.000000 | 12.000000 | 99.000000 | 99.000000 | 11.000000 | 2.000000 | 2.000000 | 98.000000 | 2.0 | 8758.553932 |
In [12]:
map_edu = {
1: "No level of education",
2: "Early childhood education",
3: "Incomplete primary education",
4: "Complete primary education",
5: "Incomplete secondary education",
6: "Complete secondary education",
7: "Incomplete non-university higher education",
8: "Complete non-university higher education",
9: "Incomplete university higher education",
10: "Complete university higher education",
11: "Master's/Doctoral degree",
12: "Special basic education"
}
df["level_ed"] = df["P301A"].map(map_edu)
In [13]:
df["region"] = df["UBIGEO"].astype(str).str.zfill(6).str[:2]
In [14]:
regions = {
"01": "Amazonas", "02": "Áncash", "03": "Apurímac", "04": "Arequipa",
"05": "Ayacucho", "06": "Cajamarca", "07": "Callao", "08": "Cusco",
"09": "Huancavelica", "10": "Huánuco", "11": "Ica", "12": "Junín",
"13": "La Libertad", "14": "Lambayeque", "15": "Lima",
"16": "Loreto", "17": "Madre de Dios", "18": "Moquegua",
"19": "Pasco", "20": "Piura", "21": "Puno", "22": "San Martín",
"23": "Tacna", "24": "Tumbes", "25": "Ucayali"
}
df["region"] = df["region"].map(regions)
In [15]:
table_reg = pd.crosstab(df["level_ed"], df["region"], normalize="columns")*100
table_reg
Out[15]:
| region | Amazonas | Apurímac | Arequipa | Ayacucho | Cajamarca | Callao | Cusco | Huancavelica | Huánuco | Ica | ... | Madre de Dios | Moquegua | Pasco | Piura | Puno | San Martín | Tacna | Tumbes | Ucayali | Áncash |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| level_ed | |||||||||||||||||||||
| Complete non-university higher education | 6.165228 | 3.267974 | 9.278351 | 4.046997 | 5.297297 | 11.094675 | 6.306306 | 3.021148 | 4.081633 | 9.440880 | ... | 6.265664 | 15.192744 | 4.797048 | 6.239316 | 3.697749 | 6.042296 | 8.016878 | 7.733813 | 3.954214 | 5.440901 |
| Complete primary education | 16.646116 | 11.111111 | 8.247423 | 12.402089 | 17.729730 | 8.136095 | 9.523810 | 12.688822 | 10.204082 | 7.516040 | ... | 11.528822 | 9.523810 | 16.051661 | 12.735043 | 13.826367 | 16.515609 | 7.313643 | 7.553957 | 11.030177 | 12.101313 |
| Complete secondary education | 14.673243 | 16.666667 | 24.554827 | 15.143603 | 15.675676 | 29.437870 | 18.146718 | 18.731118 | 14.285714 | 26.764436 | ... | 19.047619 | 19.954649 | 19.557196 | 19.487179 | 22.347267 | 16.213494 | 31.223629 | 25.000000 | 19.250780 | 20.075047 |
| Complete university higher education | 2.589396 | 4.084967 | 8.716026 | 5.613577 | 2.270270 | 7.396450 | 5.920206 | 3.172205 | 5.048335 | 9.074244 | ... | 5.012531 | 8.616780 | 4.243542 | 4.444444 | 4.180064 | 4.632427 | 7.594937 | 4.316547 | 3.537981 | 5.816135 |
| Early childhood education | 4.932182 | 2.941176 | 4.592315 | 4.830287 | 5.081081 | 3.402367 | 2.831403 | 3.323263 | 3.974221 | 4.307974 | ... | 5.513784 | 2.947846 | 4.981550 | 5.641026 | 2.250804 | 3.726083 | 2.812940 | 4.856115 | 5.411030 | 3.377111 |
| Incomplete non-university higher education | 2.589396 | 1.797386 | 4.404873 | 2.219321 | 1.621622 | 6.360947 | 2.574003 | 1.963746 | 2.363050 | 5.041247 | ... | 3.258145 | 3.174603 | 2.583026 | 3.846154 | 3.697749 | 2.416918 | 4.078762 | 2.877698 | 4.578564 | 2.814259 |
| Incomplete primary education | 26.880395 | 23.856209 | 14.714152 | 24.934726 | 24.972973 | 11.834320 | 21.364221 | 25.377644 | 26.638024 | 13.198900 | ... | 19.298246 | 12.698413 | 21.771218 | 21.196581 | 23.151125 | 23.967774 | 14.205345 | 20.683453 | 21.540062 | 20.919325 |
| Incomplete secondary education | 15.659679 | 15.686275 | 14.245548 | 13.577023 | 14.054054 | 12.278107 | 15.444015 | 15.407855 | 14.930183 | 13.932172 | ... | 17.293233 | 15.192744 | 14.391144 | 16.495726 | 13.183280 | 15.709970 | 12.376934 | 17.625899 | 20.603538 | 15.290807 |
| Incomplete university higher education | 3.082614 | 4.738562 | 5.998126 | 4.569191 | 1.513514 | 5.029586 | 4.375804 | 2.870091 | 5.155747 | 6.874427 | ... | 4.761905 | 5.215420 | 3.690037 | 2.735043 | 4.823151 | 5.035247 | 5.766526 | 5.215827 | 3.642040 | 5.065666 |
| Master's/Doctoral degree | 0.493218 | 0.326797 | 2.061856 | 0.783290 | 0.540541 | 1.479290 | 1.801802 | 0.453172 | 3.222342 | 1.008249 | ... | 1.002506 | 2.040816 | 0.369004 | 0.769231 | 1.125402 | 1.007049 | 1.406470 | 0.719424 | 0.416233 | 1.594747 |
| No level of education | 6.165228 | 15.522876 | 3.186504 | 11.879896 | 11.243243 | 3.106509 | 11.583012 | 12.839879 | 10.096670 | 2.199817 | ... | 7.017544 | 5.442177 | 7.564576 | 6.324786 | 7.717042 | 4.531722 | 4.922644 | 3.237410 | 6.035380 | 7.223265 |
| Special basic education | 0.123305 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.443787 | 0.128700 | 0.151057 | 0.000000 | 0.641613 | ... | 0.000000 | 0.000000 | 0.000000 | 0.085470 | 0.000000 | 0.201410 | 0.281294 | 0.179856 | 0.000000 | 0.281426 |
12 rows × 25 columns
In [18]:
plt.figure(figsize=(14,7))
table_reg.T.plot(kind="bar", stacked=True, figsize=(14,7))
plt.title("Educational level attained by region")
plt.ylabel("Percentage (%)")
plt.xlabel("Region")
plt.xticks(rotation=90)
plt.legend(title="Educational level", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
<Figure size 1400x700 with 0 Axes>
In [19]:
import seaborn as sns
plt.figure(figsize=(14,8))
sns.heatmap(table_reg, cmap="YlGnBu", linewidths=.5)
plt.title("Educational level heatmap by region")
plt.xlabel("Educational level")
plt.ylabel("Region")
plt.show()
In [ ]: