Naldi Carrion - Fab Lab ESAN - Fab Futures - Data Science

Week 2 - 2nd Class: Machine Learning

The aim of this class was to learn about neural networks, a Machine Learning model inspired by how the human brain works. A neural network is a mathematical model that learns patterns in order to:

  • Recognize images
  • Forecast values
  • Classify text, images, and behaviors
  • Make decisions based on data

Hence, it can be a simple mathematical model based on a single algorithm, or a complex model combining several different algorithms, which receives data and processes it to generate an output.

Neural network components

  • Input layer: Receives the data as input
  • Hidden layers: Layers where the different models run and patterns are learned; the more complex the model, the more hidden layers it will have (a minimal forward pass is sketched below)
  • Output layer: The final output (a number in regression, a binary classification, or one of several categories)
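
To make the flow concrete, here is a minimal sketch of data passing through these three kinds of layers. It is my own toy example in NumPy with made-up sizes (4 inputs, 8 hidden units, 1 output), not code from the class:

import numpy as np

rng = np.random.default_rng(0)

# Toy network: 4 inputs -> 8 hidden units -> 1 output
W1, b1 = rng.normal(size=(4, 8)), np.zeros(8)   # input layer -> hidden layer
W2, b2 = rng.normal(size=(8, 1)), np.zeros(1)   # hidden layer -> output layer

def relu(z):
    return np.maximum(0, z)                     # activation (see next section)

x = rng.normal(size=(1, 4))                     # one sample entering the input layer
h = relu(x @ W1 + b1)                           # hidden layer: weighted sum + activation
y = h @ W2 + b2                                 # output layer: a single number (regression)
print(y)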

Activation functions

Once the neural network receives data, it processes it through different functions, algorithms, or models, and it needs to apply a special function to learn patterns. This function is called an "activation function". It acts as a filter, transforming the data and allowing the neural network to learn. During the class the professor mentioned the following:

  • Sigmoid: Transforms any number into a value between 0 and 1; used for probabilities
  • Tanh: Transforms any number into a value between -1 and 1
  • ReLU (Rectified Linear Unit): Widely used nowadays; turns every negative value into 0 while keeping the positive ones. The problem is that neurons can "die" (always output 0) when their inputs stay negative.
  • Leaky ReLU: Similar to ReLU, but lets negative inputs through by assigning them a small slope, avoiding dying neurons.
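
Since these four functions are just element-wise formulas, they are easy to write down. A minimal NumPy sketch (my own illustration, not class code):

import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))           # squashes any number into (0, 1)

def tanh(z):
    return np.tanh(z)                     # squashes any number into (-1, 1)

def relu(z):
    return np.maximum(0, z)               # negatives -> 0, positives unchanged

def leaky_relu(z, alpha=0.01):
    return np.where(z > 0, z, alpha * z)  # small slope for negative inputs

z = np.array([-2.0, -0.5, 0.0, 0.5, 2.0])
for f in (sigmoid, tanh, relu, leaky_relu):
    print(f.__name__, f(z))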

How does a neural network learn?

Through backpropagation. The training process is a cycle (loop): input → prediction → compare prediction with the real value → error calculation → internal parameter adjustment → feedback loop. It is like learning something by trying it out: if we fail, we try again. That is backpropagation (error correction). This algorithm propagates errors back through the network to generate weight updates. Concepts we need to understand:

  • Gradient descent: The way weights are adjusted to reduce the loss function (see the sketch after this list)
  • Learning rate: The ratio at which gradients are applied to update the weights
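
A tiny worked example helps: fitting a single weight w to data generated by y = 3x using gradient descent. The data and learning rate here are made up for illustration:

import numpy as np

# Toy data: y = 3x, so the ideal weight is 3
x = np.array([1.0, 2.0, 3.0, 4.0])
y = 3.0 * x

w = 0.0      # initial weight
lr = 0.01    # learning rate: how large each correction step is

for step in range(100):
    error = w * x - y                # prediction minus real value
    grad = 2 * np.mean(error * x)    # gradient of the mean squared error w.r.t. w
    w -= lr * grad                   # gradient descent update

print(w)  # close to 3.0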

Optimization algorithms

The method used to decide how the network should be adjusted when an error is detected. During the class the professor highlighted:

  • Stochastic gradient descent (SGD): The classic method; adjusts the weights step by step on random subsets of the data.
  • Adam (Adaptive Moment Estimation): Adapts the learning rate, correcting the error while anticipating how the error curve is "moving" (a one-weight sketch follows).
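
As a rough illustration of how Adam adapts its steps, here is one weight updated with the Adam formulas over three invented gradient values (the hyperparameters are the usual defaults; none of this comes from the class notebook):

import numpy as np

lr, beta1, beta2, eps = 0.001, 0.9, 0.999, 1e-8
w, m, v = 0.5, 0.0, 0.0

for t, grad in enumerate([0.30, 0.28, 0.31], start=1):
    m = beta1 * m + (1 - beta1) * grad        # running average of gradients (momentum)
    v = beta2 * v + (1 - beta2) * grad**2     # running average of squared gradients
    m_hat = m / (1 - beta1**t)                # bias corrections for the early steps
    v_hat = v / (1 - beta2**t)
    w -= lr * m_hat / (np.sqrt(v_hat) + eps)  # adaptive update
    print(t, w)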

Overfitting

This occurs when the model learns the dataset so perfectly that it no longer generalizes (like memorizing an exam instead of learning the material). Techniques to prevent this (see the sketch after the list):

  • Dropout: randomly shuts down neurons while training
  • Early stopping: halts training when it no longer improves
  • Regularization: adds penalties to prevent the model from becoming overly complex
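
The three techniques map directly onto Keras options. This is a hedged sketch assuming TensorFlow/Keras is installed; it is not part of the assignment notebook, and the layer sizes are arbitrary:

from tensorflow import keras
from tensorflow.keras import layers, regularizers

model = keras.Sequential([
    layers.Dense(64, activation="relu",
                 kernel_regularizer=regularizers.l2(0.01)),  # regularization: penalize large weights
    layers.Dropout(0.5),                                     # dropout: randomly shut down half the neurons
    layers.Dense(1),
])
model.compile(optimizer="adam", loss="mse")

early_stop = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=5, restore_best_weights=True  # early stopping
)
# model.fit(X_train, y_train, validation_split=0.2, epochs=100, callbacks=[early_stop])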

Types of networks

  • MLP / DNN (Multi-Layer Perceptron / Deep Neural Network) = “Common” interconnected layered networks, used for:
    • Basic classification
    • Regression
    • Training with tabular data
    • Simple pattern recognition
  • CNN (Convolutional Neural Network) = Detects visual patterns such as edges, curves, shapes, and other forms. They mimic how the human eye works; we can use them for:
    • Images
    • Computer vision
    • Object detection
    • Facial recognition
    • Meter reading
  • RNN / LSTM (Recurrent Neural Network / Long Short-Term Memory) = Used to read and analyse data such as:
    • Text
    • Sentences
    • Time series
    • Sequential data
    • Predicting future values
  • GAN (Generative Adversarial Network) = Generates images or videos. The model is composed of two competing networks:
    • Generator: creates fake images
    • Discriminator: attempts to detect them
    Purposes:
    • Generate realistic images
    • Create synthetic videos
    • Deepfakes
    • Improve resolution
    • Create art
    • Fill in missing images
  • Transformers / LLMs (Large Language Models) = Like ChatGPT, Gemini, and others; used to:
    • Understand and generate text
    • Translating
    • Reasoning
    • Summarizing
    • Performing complex language-based tasks
    • And today also images, audio, etc.
  • VAE (Variational Autoencoder) = Data generation and compression. An autoencoder is like shrinking an image to a very small representation (encoding) and expanding it back to its original form (decoding); if done correctly, it can create new images similar to the originals. Use it to:
    • Generate new images
    • Reduce dimensionality
    • Reconstruct data
    • Detect anomalies

Assignment

Water Meter Image Project – Machine Learning Notebook

This notebook summarizes the project in which I trained a Machine Learning model to read the red decimal digits of a water meter from images.

The goal is to build a clear, reproducible pipeline that I can use in my final presentation:

  • Load and clean the labels from a CSV file
  • Verify consistency between CSV labels and image files
  • Preprocess images (grayscale, resize, normalize)
  • Train a Random Forest regression model using flattened images
  • Evaluate performance with Mean Absolute Error (MAE)
  • Visualize sample predictions and images for interpretation

I want to train a Machine Learning model to automatically recognize the decimal digits (the red ones) of a water meter from images that we have for a project. This perfectly fulfills the requirement of "fitting a machine learning model to your data" because:

  • I have real data (400 images)
  • They are partially labeled (the correct reading is in an Excel file), covering only the red digits (the last numbers of the water meter)
  • The problem is clear: predicting a numeric value between 0 and 999 from each image (a regression problem), where the label corresponds to the last three red digits of the water meter
  • Step 0. Problem description: I have photos of the red digits on a water meter; each image contains a group of digits, each running from 0 to 9. The goal is to train a model that, given a photo, predicts the correct reading. This will serve as part of a system to automatically monitor water consumption.


1. Setup and libraries

In [8]:
import os

import numpy as np
import pandas as pd

from PIL import Image
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Show plots inside Jupyter
%matplotlib inline

plt.rcParams['figure.figsize'] = (6, 4)

2. Problem description

I work with 400+ images from a real water-meter monitoring project.

  • Each image shows the red decimal digits of the meter (from 000 to 999).
  • A CSV file contains the correct 3-digit reading for each image.
  • The task is to predict the numeric value (between 0 and 999) from each image.
  • This is treated as a regression problem (predicting a number, not a class).

If the model performs well, it can be integrated into a real system to automatically read water meters from photos.

3. Loading and cleaning labels

The labels are stored in:

  • File: datasets/3rd_Class_Assignment/Etiqueta_Rojo_V2.csv
  • Columns of interest:
    • Image: image file name (initially without extension)
    • label: correct reading of the red digits

Steps:

  1. Read the CSV, keeping only Image and label.
  2. Drop rows with missing values.
  3. Clean and convert label to integer.
  4. Standardize image names (lowercase + .jpeg).
  • Step 1. Read the labels from the CSV. I will use only the "Reds" (Rojo) labels, stored in the Etiqueta_Rojo_V2.csv file.
In [9]:
import pandas as pd

df = pd.read_csv("datasets/3rd_Class_Assignment/Etiqueta_Rojo_V2.csv", sep=";")
df = df.loc[:, ["Image", "label"]]
df.head()
Out[9]:
           Image  label
0  Picture 1 (1)  863.0
1  Picture 1 (2)  863.0
2  Picture 1 (3)  863.0
3  Picture 1 (4)  863.0
4  Picture 1 (5)  863.0
  • Step 2. Drop rows with missing values, and Step 3. Clean and convert label to integer (keeping trailing zeros)
In [10]:
import pandas as pd

# Read the CSV
df = pd.read_csv(
    "datasets/3rd_Class_Assignment/Etiqueta_Rojo_V2.csv",
    sep=";"
)

# Keep only the relevant columns
df = df.loc[:, ["Image", "label"]]

# Clean the 'label' column
df["label"] = df["label"].astype(str)       # everything to text
df["label"] = df["label"].str.strip()      # strip whitespace
df = df[df["label"] != ""]                 # drop empty strings
df["label"] = pd.to_numeric(df["label"], errors="coerce")
df = df.dropna(subset=["label"])           # drop unparseable rows
df["label"] = df["label"].astype(int)      # now integers

df.head(), df.dtypes
Out[10]:
(           Image  label
 0  Picture 1 (1)    863
 1  Picture 1 (2)    863
 2  Picture 1 (3)    863
 3  Picture 1 (4)    863
 4  Picture 1 (5)    863,
 Image    object
 label     int64
 dtype: object)
  • Step 4. Standardize image names (lowercase + .jpeg).
In [21]:
# Remove repeated ".jpeg" extensions
while df["Image"].str.contains(".jpeg.jpeg", regex=False).any():
    df["Image"] = df["Image"].str.replace(".jpeg.jpeg", ".jpeg", regex=False)

df.head()
Out[21]:
                Image  label
0  picture 1 (1).jpeg    863
1  picture 1 (2).jpeg    863
2  picture 1 (3).jpeg    863
3  picture 1 (4).jpeg    863
4  picture 1 (5).jpeg    863
In [22]:
# NORMALIZE FILE NAMES FROM THE CSV
# Normalize file names WITHOUT adding extensions
df["Image"] = df["Image"].astype(str).str.strip().str.lower()

df.head()
Out[22]:
                Image  label
0  picture 1 (1).jpeg    863
1  picture 1 (2).jpeg    863
2  picture 1 (3).jpeg    863
3  picture 1 (4).jpeg    863
4  picture 1 (5).jpeg    863

4. Consistency check between CSV and image folder

Before training any model, it is important to ensure that:

  • Every image mentioned in the CSV file exists in the image folder
  • There are no extra images without labels

This step avoids silent errors during training and evaluation.

In [23]:
import os

image_dir = "datasets/3rd_Class_Assignment/Rojo_V2"

# Actual files in the folder
folder_files = set(os.listdir(image_dir))

# Files listed in the CSV
csv_files = set(df["Image"].tolist())

# Files in the CSV but NOT in the folder
missing_files = csv_files - folder_files

# Files in the folder but NOT in the CSV
extra_files = folder_files - csv_files

print("Missing files (CSV → folder):")
print(missing_files)

print("\nExtra files (folder → CSV):")
print(extra_files)
Missing files (CSV → folder):
set()

Extra files (folder → CSV):
set()
In [24]:
# Rows where Image is NaN
empty_nan = df[df["Image"].isna()]

# Rows where Image is not NaN but is empty after cleaning
empty_blank = df[df["Image"].astype(str).str.strip() == ""]

print("Rows with NaN:")
print(empty_nan)

print("\nRows with empty text:")
print(empty_blank)
Rows with NaN:
Empty DataFrame
Columns: [Image, label]
Index: []

Rows with empty text:
Empty DataFrame
Columns: [Image, label]
Index: []
In [25]:
import pandas as pd

# Read the CSV
df = pd.read_csv("datasets/3rd_Class_Assignment/Etiqueta_Rojo_V2.csv", sep=";")

# Keep only the relevant columns
df = df.loc[:, ["Image", "label"]]

# 1) Drop rows where Image or label is NaN (the empty rows at the end)
df = df.dropna(subset=["Image", "label"])

# 2) Reset the index so it is clean
df = df.reset_index(drop=True)

df.head(), df.shape
Out[25]:
(           Image  label
 0  Picture 1 (1)  863.0
 1  Picture 1 (2)  863.0
 2  Picture 1 (3)  863.0
 3  Picture 1 (4)  863.0
 4  Picture 1 (5)  863.0,
 (442, 2))

Clean and convert label to integer:

In [26]:
df["label"] = df["label"].astype(str)
df["label"] = df["label"].str.strip()
df["label"] = pd.to_numeric(df["label"], errors="coerce")
df = df.dropna(subset=["label"])
df["label"] = df["label"].astype(int)

Standardize image names:

In [27]:
df["Image"] = df["Image"].str.lower() + ".jpeg"
In [21]:
import os

image_dir = "datasets/3rd_Class_Assignment/Rojo_V2"

folder_files = set(os.listdir(image_dir))
csv_files = set(df["Image"].tolist())

missing_files = csv_files - folder_files
extra_files = folder_files - csv_files

print("Missing files (CSV → folder):", missing_files)
print("Extra files (folder → CSV):", extra_files)
Missing files (CSV → folder): set()
Extra files (folder → CSV): set()

5. Loading and preprocessing images

For each row in the cleaned DataFrame:

  1. Open the corresponding image from Rojo_V2
  2. Convert to grayscale
  3. Resize to 64 × 64 pixels
  4. Normalize pixel values to the range [0, 1]
  5. Store image as a NumPy array, and store its label

Finally:

  • X will contain all images
  • y will contain the corresponding 3-digit labels
In [28]:
import numpy as np
from PIL import Image
import os

image_dir = "datasets/3rd_Class_Assignment/Rojo_V2"

X = []
y = []

for _, row in df.iterrows():
    filename = row["Image"]
    label = row["label"]

    img_path = os.path.join(image_dir, filename)

    try:
        # Load the image and convert to grayscale
        img = Image.open(img_path).convert("L")

        # Resize to 64x64
        img = img.resize((64, 64))

        # Normalize to the [0, 1] range
        X.append(np.array(img) / 255.0)
        y.append(label)

    except Exception as e:
        print("Error loading:", img_path, " → ", e)

# Convert to NumPy arrays
X = np.array(X, dtype="float32")
y = np.array(y, dtype="float32")

# Add a channel dimension for CNNs: (N, 64, 64, 1)
X = X[..., np.newaxis]

X.shape, y.shape
Out[28]:
((442, 64, 64, 1), (442,))

6. Preparing data for a classical ML model

I use a Random Forest Regressor, which expects 2D input:

  • Shape: (n_samples, n_features)

So I flatten each image from (64, 64, 1) into a single vector of length 4096.

In [29]:
# X has shape (n_images, 64, 64, 1)
X_flat = X.reshape((X.shape[0], -1))
X_flat.shape
Out[29]:
(442, 4096)

7. Train–test split

I split the dataset into:

  • Training set: 80% of the images
  • Test set: 20% of the images

The test set is kept separate to obtain an unbiased estimate of model performance.

In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_flat, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[30]:
((353, 4096), (89, 4096), (353,), (89,))

8. Training a Random Forest regression model

I chose a Random Forest Regressor because:

  • It is robust to noise and works well with tabular data (flattened pixels)
  • It can model non-linear relationships
  • It requires very little hyperparameter tuning for a good baseline

Key settings:

  • n_estimators = 300 trees
  • random_state = 42 for reproducibility
  • n_jobs = -1 to use all CPU cores available
In [31]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)
print("Modelo entrenado.")
Modelo entrenado.

9. Model evaluation – Mean Absolute Error (MAE)

To evaluate the model, I use Mean Absolute Error (MAE):

$$\text{MAE} = \frac{1}{N} \sum_{i=1}^{N} \left| y_i - \hat{y}_i \right|$$

In our case:

  • y_i = true meter reading
  • ŷ_i = predicted reading
  • N = number of samples

MAE tells us, on average, how many units the prediction is off from the true value.

In [32]:
from sklearn.metrics import mean_absolute_error

y_pred = rf_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)

print("MAE =", mae)
MAE = 3.0522846441947586
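
As a sanity check, the same number can be computed directly from the MAE definition above, using the y_test and y_pred arrays from the previous cell:

import numpy as np

print(np.mean(np.abs(y_test - y_pred)))  # should match the sklearn MAE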

Model Evaluation – Interpretation (MAE)

To evaluate the performance of the model, I used the Mean Absolute Error (MAE), which measures the average difference between the true meter reading and the predicted value. The model achieved:

MAE = 3.05

This means that, on average, the model is off by only about 3 units when predicting the 3-digit value (000–999) shown in the red digits of the water meter. Considering the limited dataset size (442 labeled images) and the classical machine learning approach (Random Forest), I think this is an excellent result. An average error of 3 units represents less than 0.3% deviation, which is sufficiently accurate for a real-world monitoring system where readings change gradually over time.

9.1 True vs predicted readings

A simple scatter plot of true vs predicted values helps us see the global behaviour:

  • Points close to the diagonal line indicate good predictions
  • Systematic deviations would appear as visible patterns
In [33]:
# Quick check that the evaluation arrays are still available
'y_test' in globals(), 'y_pred' in globals()
Out[33]:
(True, True)
In [34]:
plt.figure()
plt.scatter(y_test, y_pred)
plt.xlabel("True Reading")
plt.ylabel("Predicted Reading")
plt.title("True vs Predicted Water Meter Readings")
plt.show()
(Output: scatter plot of true vs predicted water meter readings)

10. Example predictions (numeric)

To make the result more intuitive, I round the predictions to the nearest integer and compare some examples.

In [35]:
# Round predictions to nearest integer
y_pred_round = np.round(y_pred).astype(int)

# Show a few examples
n_examples = 10
print("Sample comparisons (true vs predicted):\n")
for i in range(n_examples):
    print(f"True: {int(y_test[i])}  |  Predicted: {y_pred_round[i]}")
Sample comparisons (true vs predicted):

True: 862  |  Predicted: 862
True: 861  |  Predicted: 860
True: 872  |  Predicted: 871
True: 804  |  Predicted: 828
True: 872  |  Predicted: 872
True: 760  |  Predicted: 781
True: 821  |  Predicted: 822
True: 815  |  Predicted: 817
True: 863  |  Predicted: 862
True: 880  |  Predicted: 880

11. Visual examples – test images with predictions

Finally, I display some test images with their true and predicted labels in the title.

This is a powerful way to show the model’s performance in the final presentation, because the audience can see the digits and the prediction at the same time.

In [36]:
# Show a few test images with their predictions
n_images_to_show = 5

for i in range(n_images_to_show):
    img = X_test[i].reshape(64, 64)
    true_val = int(y_test[i])
    pred_val = y_pred_round[i]

    plt.figure()
    plt.imshow(img, cmap="gray")
    plt.title(f"True: {true_val}  |  Predicted: {pred_val}")
    plt.axis("off")
    plt.show()
(Output: five test images shown with their true and predicted readings in the titles)

12. Interpretation notes

When I present this notebook, I can highlight:

  • Real-world data: 400+ real images from a water meter monitoring project
  • Clean data pipeline: label cleaning, file consistency checks, and systematic image preprocessing
  • Model performance:
    • The MAE is low compared to the full 0–999 range
    • In practical terms, the model is usually off by only a few units
  • Practical relevance:
    • Such a model can support automatic meter reading, reducing manual work
    • It is a solid baseline that can later be improved with Convolutional Neural Networks (CNNs), as sketched below
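
For reference, a minimal sketch of such a CNN baseline, assuming TensorFlow/Keras is available and reusing the (N, 64, 64, 1) arrays X and y built in section 5; the architecture and epochs are illustrative, not tuned:

from tensorflow import keras
from tensorflow.keras import layers

cnn = keras.Sequential([
    keras.Input(shape=(64, 64, 1)),           # grayscale 64x64 images from section 5
    layers.Conv2D(16, 3, activation="relu"),  # learn local visual patterns (edges, strokes)
    layers.MaxPooling2D(),
    layers.Conv2D(32, 3, activation="relu"),
    layers.MaxPooling2D(),
    layers.Flatten(),
    layers.Dense(64, activation="relu"),
    layers.Dense(1),                          # single regression output (0–999)
])
cnn.compile(optimizer="adam", loss="mae")
# cnn.fit(X, y, validation_split=0.2, epochs=30)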