Week 1: Assignment 2 - Tools
For this assignment I used the 2021 National Survey on Self-Reported Well-Being (ENBIARE), a timely dataset.
Description: The 2021 National Survey on Self-Reported Well-Being (ENBIARE) has the overall objective of producing statistical information on different dimensions of well-being measurement: highlighting how women and men perceive and evaluate their own situation, collecting information on a wide range of circumstances and events experienced by individuals in order to identify drivers and detractors of well-being, and highlighting inequalities between population groups. The survey covers the adult population aged 18 and over who are literate, Spanish-speaking, and reside in urban or rural areas.
First I had a problem finding where my dataset was. Even though I could see it in my directory, Python did not recognize the path.
I worked through all of this with ChatGPT: I asked a question, it instructed me to run commands, and I showed it the results. Using AI was very powerful and let me work very fast.
Prompt: I can't find my DB in Jupyter notebooks
GPT guided me step by step with different commands.
import os
os.listdir("/home")
['jovyan']
os.listdir("/home/jovyan")
['.bash_history', '.ssh', '.lesshst', '.profile', '.mamba', '.bashrc', '.local', '.cache', 'work', '.bash_logout', '.config', '.ipython', '.wget-hsts', '.jupyter']
import os
os.listdir()
['about.ipynb', '07_Transforms.ipynb', '02_Tools.ipynb', 'images', 'Tests.ipynb', '.git', 'home.ipynb', 'Assignments', '04_ML.ipynb', '.gitlab-ci.yml', '.ipynb_checkpoints', 'datasets', '01_Introduction.ipynb', '05_Probability.ipynb', 'submission_jax.csv', 'submission.csv', '03_Fitting.ipynb', 'theme', '06_Density_Est.ipynb', 'presentation.ipynb', '.gitignore', 'README.md']
os.listdir("datasets")
['Titanic_test.csv', 'TENBIARE.csv', 'enbiare_2021_fd.xlsx', 'historico_accidentes.csv', 'Titanic_train.csv', '.ipynb_checkpoints', 'denue_inegi_21_.csv', 'submission.csv', '.gitignore']
import os
os.getcwd()
'/home/jovyan/work/aristarco-cortes'
import os
for root, dirs, files in os.walk("/", topdown=True):
    if "TENBIARE.csv" in files:
        print("FOUND AT:", os.path.join(root, "TENBIARE.csv"))
        break
FOUND AT: /home/jovyan/work/aristarco-cortes/datasets/TENBIARE.csv
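As a side note, the same search can be written more compactly with pathlib. This is just a sketch of an equivalent approach, assuming the file lives somewhere under the home directory (walking from "/" also touches system folders and is much slower):
from pathlib import Path

# Recursively search only under the home directory for the dataset
matches = list(Path("/home/jovyan").rglob("TENBIARE.csv"))
if matches:
    print("FOUND AT:", matches[0])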
Finally we (Mr. GPT and I) found the path. I stored it in a variable to make it shorter and easier to work with, and used charset_normalizer to confirm the file's encoding.
from charset_normalizer import from_path

# Detect the file's character encoding before reading it with pandas
ruta = "/home/jovyan/work/aristarco-cortes/datasets/TENBIARE.csv"
resultado = from_path(ruta).best()
print(resultado.encoding)
utf_8
To open the dataset for the first time, I read it with pd.read_csv using the utf-8 encoding, since my dataset has special characters. Then I used df.head() to display the column names and the first five rows. This is how I made sure I was up and running.
Prompt: Trying to open my database I get a UTF-8 encoding error; help me import the data
import pandas as pd
ruta = "/home/jovyan/work/aristarco-cortes/datasets/TENBIARE.csv"
df = pd.read_csv(ruta, encoding="utf-8")
df.head()
| | FOLIO | VIV_SEL | HOGAR | N_REN | PA1 | PA2 | PA3_01 | PA3_02 | PA3_03 | PA3_04 | ... | PJ6_11 | PJ6_12 | PJ6_13 | PJ7_1 | PJ7_2 | ENT | TLOC | FAC_ELE | EST_DIS | UPM_DIS |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 1 | 4 | 10 | 10 | 10 | 10 | 10 | 10 | ... | 2 | 1 | 2 | 4 | 5 | 1 | 1 | 1458 | 2 | 1 |
| 1 | 2 | 2 | 1 | 2 | 9 | 9 | 7 | 9 | 8 | 9 | ... | 2 | 2 | 2 | 9 | 5 | 1 | 1 | 353 | 2 | 1 |
| 2 | 3 | 5 | 1 | 1 | 10 | 10 | 9 | 10 | 10 | 10 | ... | 1 | 1 | 2 | 11 | 9 | 1 | 1 | 353 | 2 | 1 |
| 3 | 4 | 1 | 1 | 2 | 5 | 8 | 5 | 5 | 10 | 9 | ... | 2 | 1 | 2 | 12 | 5 | 1 | 1 | 657 | 2 | 2 |
| 4 | 5 | 3 | 1 | 3 | 10 | 10 | 9 | 10 | 10 | 10 | ... | 1 | 2 | 2 | 11 | 4 | 1 | 1 | 985 | 2 | 2 |
5 rows × 284 columns
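If utf-8 had failed here, a simple fallback would have been to retry with the encoding that charset_normalizer detected. A minimal sketch, reusing the ruta and resultado variables from above:
# Try utf-8 first; on a decode error, fall back to the detected encoding
try:
    df = pd.read_csv(ruta, encoding="utf-8")
except UnicodeDecodeError:
    df = pd.read_csv(ruta, encoding=resultado.encoding)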
The next step was to make sure the size of my DB was correct, using the df.shape command. ChatGPT guided me step by step to check the shape, types, and list of columns, to be sure I had read the correct data.
df.shape
(31166, 284)
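A compact alternative is df.info(), which reports the shape, dtypes, and memory usage in a single call; a quick sketch:
# Condensed summary: row/column counts, dtype counts, and memory usage
df.info(verbose=False)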
Then I listed the columns using df.columns.tolist()
df.columns.tolist()
['FOLIO', 'VIV_SEL', 'HOGAR', 'N_REN', 'PA1', 'PA2', 'PA3_01', 'PA3_02', 'PA3_03', 'PA3_04', 'PA3_05', 'PA3_06', 'PA3_07', 'PA3_08', 'PA3_09', 'PA3_10', 'PA3_11', 'PA3_12', 'PA3_13', 'PA3_14', 'PA3_15', 'PA3_16', 'PA3_17', 'PA4_01', 'PA4_02', 'PA4_03', 'PA4_04', 'PA4_05', 'PA4_06', 'PA4_07', 'PA4_08', 'PA4_09', 'PA4_10', 'PA5', 'PA6', 'PB1_01', 'PB1_02', 'PB1_03', 'PB1_04', 'PB1_05', 'PB1_06', 'PB1_07', 'PB1_08', 'PB1_09', 'PB1_10', 'PB1_11', 'PB1_12', 'PB2_1', 'PB2_2', 'PC1_1', 'PC1_2', 'PC1_3', 'PC1_4', 'PC1_5', 'PC1_6', 'PC1_7', 'PC2_1', 'PC2_2', 'PC2_3', 'PC2_4', 'PC2_5', 'PC3_1', 'PC3_2', 'PC3_3', 'PC3_4', 'PC3_5', 'PC3_6', 'PC3_7', 'PC3_8', 'PC4_1', 'PC4_2', 'PD1_1', 'PD1_2', 'PD1_3', 'PD1_4', 'PD1_5', 'PD1_6', 'PD2_1', 'PD2_2', 'PD2_3', 'PD2_4', 'PD2_5', 'PD2_6', 'PD2_7', 'PD3_1', 'PD3_2', 'PE1', 'PE2', 'PE3', 'PE4', 'PE5', 'PE6', 'PE7', 'PE8', 'PE9', 'PE10_1', 'PE10_2', 'PE10_3', 'PE10_4', 'PE10_5', 'PE10_6', 'PE10_7', 'PE10_8', 'PE11', 'PE12', 'PE13_1', 'PE13_2', 'PE14', 'PE15', 'PE16', 'PE17', 'PE18_1', 'PE18_2', 'PF1_1', 'PF1_2', 'PF1_3', 'PF1_4', 'PF1_5', 'PF1_6', 'PF2', 'PF3', 'PF4_1', 'PF4_2', 'PF4_3', 'PF4_4', 'PF4_5', 'PF4_6', 'PF4_7', 'PF4_8', 'PF4_9', 'PF5_1', 'PF5_2', 'PF5_3', 'PF6_1', 'PF6_2', 'PF6_3', 'PF6_4', 'PF6_5', 'PF6_6', 'PF6_7', 'PF6_8', 'PF7_01', 'PF8_01', 'PF7_02', 'PF8_02', 'PF7_03', 'PF8_03', 'PF7_04', 'PF8_04', 'PF7_05', 'PF8_05', 'PF7_06', 'PF8_06', 'PF7_07', 'PF8_07', 'PF7_08', 'PF8_08', 'PF7_09', 'PF8_09', 'PF7_10', 'PF8_10', 'PF7_11', 'PF8_11', 'PF7_12', 'PF8_12', 'PF7_13', 'PF8_13', 'PF7_14', 'PF8_14', 'PF7_15', 'PF8_15', 'PF7_16', 'PF8_16', 'PF9_01', 'PF9_02', 'PF9_03', 'PF9_04', 'PF9_05', 'PF9_06', 'PF9_07', 'PF9_08', 'PF9_09', 'PF9_10', 'PF10_1', 'PF10_2', 'PF10_3', 'PF10_4', 'PF10_5', 'PF10_6', 'PF10_7', 'PF11_1', 'PF11_2', 'PF11_3', 'PF11_4', 'PF12_1', 'PF12_2', 'PF12_3', 'PF12_4', 'PF13_1', 'PF13_2', 'PF13_3', 'PF13_4', 'PF14_1_ESP', 'PF14_2_ESP', 'PF14_3_ESP', 'PG1_1', 'PG1_2', 'PG1_3', 'PG1_4', 'PG1_5', 'PG1_6', 'PG1_7', 'PG1_8', 'PG2', 'PG3_1', 'PG3_2', 'PG3_3', 'PG3_4', 'PG4_1', 'PG4_2', 'PG4_3', 'PG5_1_1', 'PG5_2_1', 'PG5_3_1', 'PG6', 'PG7', 'PG8', 'PH1_1', 'PH1_2', 'PH1_3', 'PH1_4', 'PH2_1', 'PH2_2', 'PH2_3', 'PH2_4', 'PH3', 'PH4', 'PH5', 'PH6_1', 'PH6_2', 'PH6_3', 'PH7_1', 'PH7_2', 'PH8', 'PI1', 'PI2', 'PI3', 'PI4', 'PI5', 'PJ1', 'PJ2_1', 'PJ2_2', 'PJ2_3', 'PJ2_4', 'PJ2_5', 'PJ2_6', 'PJ2_7', 'PJ2_8', 'PJ3_1', 'PJ3_2', 'PJ3_3', 'PJ3_4', 'PJ4', 'PJ5', 'PJ6_01', 'PJ6_02', 'PJ6_03', 'PJ6_04', 'PJ6_05', 'PJ6_06', 'PJ6_07', 'PJ6_08', 'PJ6_09', 'PJ6_10', 'PJ6_11', 'PJ6_12', 'PJ6_13', 'PJ7_1', 'PJ7_2', 'ENT', 'TLOC', 'FAC_ELE', 'EST_DIS', 'UPM_DIS']
It is important to know what type of data is in the DB. This is done with df.dtypes. In my case, all of the columns are integers (int64).
df.dtypes
FOLIO int64
VIV_SEL int64
HOGAR int64
N_REN int64
PA1 int64
...
ENT int64
TLOC int64
FAC_ELE int64
EST_DIS int64
UPM_DIS int64
Length: 284, dtype: object
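To confirm the "all integers" claim without scrolling through 284 rows, the dtypes can be aggregated; a small sketch:
# Count the columns per dtype; a single int64 entry means every column is an integer
print(df.dtypes.value_counts())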
My project for this assignment is to compare the level of happiness with the kind of music people listen to. I was curious to find out whether there was a correlation between these two variables.
First I renamed column PA1 to 'Nivel de felicidad' (happiness level) with df.rename(columns={'Column': 'Name'})
Prompt: How can I rename a column in Python in a Jupyter notebook
Second, since the happiness level is measured from 0 to 10, I grouped 0 to 6 as 'infeliz' (unhappy), 7 and 8 as 'feliz' (happy), and 9 and 10 as 'muy feliz' (very happy)
Prompt: How can I group a 1-to-10 scale into only 3 levels in a DB in Python in a Jupyter notebook
I did this with an if/elif function, which I then applied to create the new categorical column 'Categoria_Felicidad' (a shorter alternative with pd.cut is sketched after the code below)
Third, I mapped column PJ7_1 (musical genre) from integer codes to genre names. With that mapping I created the column 'Genero_Musical' (musical genre)
Prompt: How can I rename a column in a DB in Python in a Jupyter notebook
Prompt: How can I make a dataset with a single variable from a DB in Python in a Jupyter notebook
# --- 1. Rename the PA1 column ---
df = df.rename(columns={'PA1': 'Nivel de felicidad'})

# --- 2. Map 'Nivel de felicidad' to categories ---
# Classification function for the happiness level
def clasificar_felicidad(nivel):
    # Try to convert to an integer; if that fails (e.g. NaN or an
    # invalid string), return 'Sin Dato' (no data)
    try:
        nivel_int = int(nivel)
    except (ValueError, TypeError):
        return 'Sin Dato'
    if nivel_int <= 6:
        return 'infeliz'
    elif nivel_int in [7, 8]:
        return 'feliz'
    elif nivel_int in [9, 10]:
        return 'muy feliz'
    else:
        # Numeric values outside the 0-10 range
        return 'Sin Dato'

# Apply the function to create the new categorical column
df['Categoria_Felicidad'] = df['Nivel de felicidad'].apply(clasificar_felicidad)

# --- 3. Map the PJ7_1 column (genres) ---
genero_map = {
    1: 'Clasica',
    2: 'Blues',
    3: 'Hip Hop',
    4: 'Metal',
    5: 'Reggaeton',
    6: 'Salsa',
    7: 'Electronica',
    8: 'Boleros',
    9: 'Banda',
    10: 'Ranchero',
    11: 'Pop',
    12: 'Rock',
    13: 'Otros'
}

# Create the musical genre column, filling NaN/unmapped values
df['Genero_Musical'] = df['PJ7_1'].map(genero_map).fillna('Sin Dato')

# --- Show the result of the transformations ---
# Print the first rows of the transformed columns
print("✅ Transformations complete.")
print("\n--- Preview of the Transformed Columns ---")
print(df[['Nivel de felicidad', 'Categoria_Felicidad', 'PJ7_1', 'Genero_Musical']].head())

# Print the counts to verify the distribution of the new categories
print("\n--- Happiness Category Counts ---")
print(df['Categoria_Felicidad'].value_counts())
✅ Transformations complete.

--- Preview of the Transformed Columns ---
   Nivel de felicidad Categoria_Felicidad  PJ7_1 Genero_Musical
0                  10           muy feliz      4          Metal
1                   9           muy feliz      9          Banda
2                  10           muy feliz     11            Pop
3                   5             infeliz     12           Rock
4                  10           muy feliz     11            Pop

--- Happiness Category Counts ---
Categoria_Felicidad
muy feliz    17463
feliz         9791
infeliz       3912
Name: count, dtype: int64
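For the record, the same three-level grouping can also be done without a custom function using pd.cut. A sketch, with the caveat that values outside 0-10 become NaN instead of 'Sin Dato' (the column name Categoria_Felicidad_cut is just for illustration):
# Bin the 0-10 scale: 0-6 -> infeliz, 7-8 -> feliz, 9-10 -> muy feliz
df['Categoria_Felicidad_cut'] = pd.cut(
    df['Nivel de felicidad'],
    bins=[-1, 6, 8, 10],
    labels=['infeliz', 'feliz', 'muy feliz']
)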
To plot the data I used Matplotlib. I made two different graphics: one with the levels of happiness stacked per genre, and then I transposed the matrix to show the genres stacked on each level of happiness.
Prompt: How can I transpose the matrix and group the data to make a grouped chart with the "Genero_Musical" and "Categoria_Felicidad" categories. I had to refine this over several follow-up prompts and finally got:
# --- 3. Computation for the stacked bar chart (proportions) ---
import matplotlib.pyplot as plt
import pandas as pd

# Build the contingency table and normalize by row (by genre)
contingency_table = pd.crosstab(df['Genero_Musical'], df['Categoria_Felicidad'])
contingency_table_prop = contingency_table.div(contingency_table.sum(axis=1), axis=0)

# Reorder the columns and genres for a better visualization
order = ['infeliz', 'feliz', 'muy feliz']
contingency_table_prop = contingency_table_prop.reindex(columns=order, fill_value=0)
contingency_table_prop['order_col'] = contingency_table_prop['muy feliz']
contingency_table_prop = contingency_table_prop.sort_values(by='order_col', ascending=False).drop(columns=['order_col'])

# --- 4. Generate the plot (stacked bar chart) ---
# Color palette for the happiness categories
colors = {'infeliz': '#e34a33', 'feliz': '#fdbb84', 'muy feliz': '#2c7fb8'}
plt.figure(figsize=(12, 7))
contingency_table_prop.plot(kind='bar', stacked=True, color=[colors[col] for col in order], ax=plt.gca())
plt.title('Happiness Category Distribution by Musical Genre', fontsize=14)
plt.xlabel('Musical Genre', fontsize=12)
plt.ylabel('Proportion of Respondents', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.legend(title='Happiness Level', loc='lower left')
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()

# Show the figure
plt.show()
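Since my original question was whether happiness and musical genre are associated, a natural follow-up is a chi-square test of independence on the unnormalized contingency table. A minimal sketch, assuming scipy is available in the environment (note that this ignores the survey weights in FAC_ELE, so it is only a rough check):
from scipy.stats import chi2_contingency

# Test whether happiness category and musical genre are independent;
# a small p-value suggests an association, not a causal effect
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f"chi2 = {chi2:.1f}, dof = {dof}, p = {p:.3g}")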
Learning Points
Database: I learned to choose a database and load it in Jupyter notebooks.
AI: I learned to use AI for data science. I had never done any data science before, so it was a great discovery for me.
Order and stepping: I learned that it is very important to have a clear picture of the final result I want to achieve, so that I can clearly instruct the AI and do everything step by step, a little like doing it by hand. It was hard for me to start this data science process. A clear mind brings clear results.