Class 5: Probability
A probability distribution describes how the values in a dataset are spread. This is important in data science because, before fitting the models we use or applying ML, we need to understand how the data are distributed. We are also going to try to predict which values are missing from the dataset, to improve its usefulness.
Using the dataset of births per year, let's draw a histogram of the data:
import json
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
# 1. Load the data: the JSON file holds a list of records, and each record
#    has a "births" field. The encoding is given explicitly so the read does
#    not depend on the platform's default encoding.
with open("datasets/nacimientos_espana_1975_2024.json", encoding="utf-8") as f:
    data = json.load(f)
births = np.array([d["births"] for d in data])

# 2. Histogram. density=True normalises the bars so their total area is 1,
#    which makes the plot comparable with a probability density curve.
plt.figure(figsize=(8, 4))
plt.hist(births, bins=10, density=True, alpha=0.6, color='blue')
plt.title("Histograma de nacimientos")
plt.xlabel("Nacimientos")
plt.ylabel("Densidad")
plt.grid(True)
plt.show()
Now let's add to the histogram a curve showing the fitted normal distribution:
# Fit a normal distribution to the births; norm.fit returns the estimated
# location (mu) and scale (sigma).
mu, sigma = norm.fit(births)
print("Media (mu):", mu)
print("Desviación estándar (sigma):", sigma)

# Evaluate the fitted density on 100 evenly spaced points spanning the data.
x = np.linspace(min(births), max(births), 100)
pdf = norm.pdf(x, loc=mu, scale=sigma)

# Histogram of the data with the fitted normal curve drawn on top.
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(births, bins=10, density=True, alpha=0.6, color='blue')
ax.plot(x, pdf, 'r-', linewidth=2, label=f"N(mu={mu:.0f}, sigma={sigma:.0f})")
ax.set_title("Ajuste de distribución normal a los nacimientos")
ax.set_xlabel("Nacimientos")
ax.set_ylabel("Densidad")
ax.legend()
ax.grid(True)
plt.show()
Media (mu): 448754.3 Desviación estándar (sigma): 90609.6985266478
We can see that the birth data we are using do not exactly follow a normal distribution. The values fall in the last years and are higher in the first years.
Using the temperature data from my city, León
I will try to do the same with real data: the temperature in my city over several years.
To do this, I go to the website of the official meteorological agency of Spain (AEMET) and select the data.
- Go to the website https://opendata.aemet.es/centrodedescargas/inicio
- Click the API key button
- Provide an email address to get access to the API
- You will receive the API key by email
Update: the API now only allows requests covering up to 6 months of data
Now let's add the API key to the Python code we are going to use, so we don't need to download the data manually. I'm going to use the weather station nearest to my location: La Virgen del Camino, with the ID 2661.
I'm going to start with 4 years of data, from 2020 to 2024. Note that the code below only requests the range from 2024-08-01 to 2024-12-31.
After extracting the data directly with the API, in step 4 let's investigate the distribution of the data with a histogram:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# SciPy is optional: it is only used for the normal-distribution fit in
# step 5, so a missing install just disables that step.
try:
    from scipy.stats import norm
except ImportError:
    HAS_SCIPY = False
    print("⚠️ SciPy no está instalado. No se hará el ajuste a distribución normal.")
else:
    HAS_SCIPY = True

# ==============================
# 1. CONFIGURATION
# ==============================
# The AEMET API key is a personal credential, so it is read from the
# AEMET_API_KEY environment variable instead of being written in the source:
# a key committed in a notebook is readable by everyone who can see the file.
# NOTE(review): the key that used to be embedded here should be treated as
# leaked and regenerated.
API_KEY = os.environ.get("AEMET_API_KEY", "")
if not API_KEY:
    print("⚠️ Falta la variable de entorno AEMET_API_KEY; AEMET rechazará la petición.")

# León - Virgen del Camino station (commonly used as the reference for León).
station_id = "2661"

# Date range to analyse, in the format the AEMET endpoint expects.
start_date = "2024-08-01T00:00:00UTC"
end_date = "2024-12-31T23:59:59UTC"
# ==============================
# 2. DOWNLOAD THE DATA FROM AEMET
# ==============================
# The download takes two requests: the first returns a small JSON document
# whose "datos" field is the URL of the actual data, the second fetches it.
base_url = (
    "https://opendata.aemet.es/opendata/api/"
    "valores/climatologicos/diarios/datos/"
    f"fechaini/{start_date}/fechafin/{end_date}/estacion/{station_id}"
)
headers = {"api_key": API_KEY}

# requests has no default timeout: without one a stalled connection would
# block the notebook indefinitely.
REQUEST_TIMEOUT = 30  # seconds

print("Llamando a la API de AEMET...")
resp_meta = requests.get(base_url, headers=headers, timeout=REQUEST_TIMEOUT)
if resp_meta.status_code != 200:
    raise RuntimeError(f"Error en la petición AEMET (meta): {resp_meta.status_code} - {resp_meta.text}")

meta_json = resp_meta.json()
print("Respuesta meta:", meta_json.get("descripcion", "OK"))

# A 200 response without "datos" still means there is nothing to download.
data_url = meta_json.get("datos")
if not data_url:
    raise RuntimeError("No se encontró la URL de datos en la respuesta de AEMET.")

print("Descargando datos reales desde:", data_url)
resp_data = requests.get(data_url, timeout=REQUEST_TIMEOUT)
if resp_data.status_code != 200:
    raise RuntimeError(f"Error al descargar los datos: {resp_data.status_code} - {resp_data.text}")

# The payload is JSON: a list of dicts, one per record.
data_json = resp_data.json()
# ==============================
# 3. CONVERT TO A DATAFRAME AND CLEAN
# ==============================
df = pd.DataFrame(data_json)

# tmed, tmax and tmin usually arrive as strings with a decimal comma, so the
# comma is replaced by a dot before parsing. astype(str) keeps this working
# if a column is already numeric, and errors="coerce" turns anything
# unparseable into NaN.
for col in ["tmed", "tmax", "tmin"]:
    if col in df.columns:
        as_text = df[col].astype(str).str.replace(",", ".", regex=False)
        df[col] = pd.to_numeric(as_text, errors="coerce")

# Parse the date column; invalid dates become NaT.
df["fecha"] = pd.to_datetime(df["fecha"], errors="coerce")

print("\nPrimeras filas de los datos:")
print(df.head())
print("\nInformación del DataFrame:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
df.info()
# ==============================
# 4. DISTRIBUTION ANALYSIS
# ==============================
# The analysis is done on the daily mean temperature, so stop early with a
# clear message if that column is not present.
if "tmed" not in df:
    raise RuntimeError("La columna 'tmed' no está en los datos. Revisa el endpoint o las columnas disponibles.")

# Drop the days without a valid value.
tmed = df["tmed"].dropna()

print("\nEstadísticas básicas de la temperatura media (tmed):")
print(tmed.describe())

# Histogram of the distribution, normalised to a density.
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(tmed, bins=30, density=True, alpha=0.7)
ax.set_title("Distribución de la temperatura media diaria - León")
ax.set_xlabel("Temperatura media (°C)")
ax.set_ylabel("Densidad")
ax.grid(True)
plt.show()
# ==============================
# 5. FIT A NORMAL DISTRIBUTION (ONLY IF SCIPY IS AVAILABLE)
# ==============================
if not HAS_SCIPY:
    # Nothing to fit without SciPy; tell the user how to enable this step.
    print("\nNo se ha hecho ajuste a normal porque SciPy no está instalado.")
    print("Si quieres incluirlo, instala SciPy con: pip install scipy")
else:
    mu, sigma = norm.fit(tmed)
    print("\nAjuste a distribución normal:")
    print(f" media (mu) ≈ {mu:.2f} °C")
    print(f" sigma (σ) ≈ {sigma:.2f} °C")

    # Fitted density evaluated on 200 points across the observed range.
    x = np.linspace(tmed.min(), tmed.max(), 200)
    pdf = norm.pdf(x, loc=mu, scale=sigma)

    # Histogram of the data with the fitted curve drawn on top.
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.hist(tmed, bins=30, density=True, alpha=0.7, label="Datos (histograma)")
    ax.plot(x, pdf, "r-", linewidth=2, label=f"N(μ={mu:.2f}, σ={sigma:.2f})")
    ax.set_title("Temperatura media diaria - histograma + normal ajustada")
    ax.set_xlabel("Temperatura media (°C)")
    ax.set_ylabel("Densidad")
    ax.legend()
    ax.grid(True)
    plt.show()
Llamando a la API de AEMET...
Respuesta meta: exito
Descargando datos reales desde: https://opendata.aemet.es/opendata/sh/a944e043
Primeras filas de los datos:
fecha indicativo nombre provincia altitud tmed prec \
0 2024-08-01 2661 LEÓN, VIRGEN DEL CAMINO LEON 916 24.2 0,0
1 2024-08-02 2661 LEÓN, VIRGEN DEL CAMINO LEON 916 22.2 0,0
2 2024-08-03 2661 LEÓN, VIRGEN DEL CAMINO LEON 916 21.5 0,0
3 2024-08-04 2661 LEÓN, VIRGEN DEL CAMINO LEON 916 22.7 0,0
4 2024-08-05 2661 LEÓN, VIRGEN DEL CAMINO LEON 916 23.7 0,0
tmin horatmin tmax ... sol presMax horaPresMax presMin horaPresMin \
0 16.5 05:30 31.8 ... 12,4 914,0 00 911,0 16
1 14.9 03:33 29.6 ... 13,2 915,6 Varias 911,6 04
2 11.9 03:55 31.1 ... 13,9 915,6 00 912,3 18
3 13.4 05:42 32.0 ... 13,7 915,1 Varias 911,6 18
4 14.9 04:42 32.5 ... 13,7 913,0 00 909,3 16
hrMedia hrMax horaHrMax hrMin horaHrMin
0 44 78 05:54 22 15:44
1 52 85 Varias 28 16:51
2 38 93 Varias 20 16:32
3 31 74 Varias 19 17:30
4 34 76 Varias 16 12:15
[5 rows x 25 columns]
Información del DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 25 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 fecha 153 non-null datetime64[ns]
1 indicativo 153 non-null object
2 nombre 153 non-null object
3 provincia 153 non-null object
4 altitud 153 non-null object
5 tmed 153 non-null float64
6 prec 153 non-null object
7 tmin 153 non-null float64
8 horatmin 152 non-null object
9 tmax 153 non-null float64
10 horatmax 152 non-null object
11 dir 151 non-null object
12 velmedia 152 non-null object
13 racha 151 non-null object
14 horaracha 151 non-null object
15 sol 152 non-null object
16 presMax 152 non-null object
17 horaPresMax 152 non-null object
18 presMin 152 non-null object
19 horaPresMin 152 non-null object
20 hrMedia 152 non-null object
21 hrMax 152 non-null object
22 horaHrMax 152 non-null object
23 hrMin 152 non-null object
24 horaHrMin 152 non-null object
dtypes: datetime64[ns](1), float64(3), object(21)
memory usage: 30.0+ KB
None
Estadísticas básicas de la temperatura media (tmed):
count 153.000000
mean 12.869935
std 6.375303
min -3.600000
25% 8.500000
50% 13.000000
75% 17.200000
max 27.800000
Name: tmed, dtype: float64
Ajuste a distribución normal: media (mu) ≈ 12.87 °C sigma (σ) ≈ 6.35 °C
Downloading 2025 temperature data from the website
First let's check the dataset I'm using:
import pandas as pd
# Load the file as downloaded (latin-1 encoded) and list the column names
# pandas detected, to check how the header was parsed.
df = pd.read_csv("datasets/temp-leon.csv", encoding="latin-1")
column_names = list(df.columns)
print(column_names)
We can see that the first line of the file downloaded from the web page contains the location where the data were acquired, and this is not part of the dataset. So I will open the file, delete the first two lines and leave only the data. I will call this file temp-leon2.csv
After that, let's work with this file:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
# ============================
# 1. LOAD THE DATA
# ============================
# Reads datasets/temp-leon2.csv. The latin-1 encoding is required so that
# characters such as "ñ" are decoded correctly.
df = pd.read_csv("datasets/temp-leon2.csv", encoding="latin-1")

# Force the TM column to numeric. Values that cannot be parsed (text, or
# numbers written with a decimal comma) become NaN; they are not converted.
tm_numeric = pd.to_numeric(df["TM"], errors="coerce")
df["TM"] = tm_numeric

# Keep only the valid values, shaped as a single column (n_samples, 1).
temps = df["TM"].dropna().to_numpy().reshape(-1, 1)

print(f"Número de muestras usadas: {len(temps)}")
print("Primeras temperaturas:", temps[:10].ravel())
# ============================
# 2. FIT THE GMM (EM)
# ============================
# Number of mixture components (try 3, for example).
n_components = 2
gmm = GaussianMixture(
    n_components=n_components,
    random_state=42,
)
gmm.fit(temps)


def _component_params(model, k):
    """Return (weight, mean, std) of component k of a fitted 1-D mixture.

    The layout of ``covariances_`` depends on ``covariance_type``:
    'full' is (n_components, n_features, n_features), 'diag' is
    (n_components, n_features), 'spherical' is (n_components,) and 'tied'
    is a single (n_features, n_features) matrix shared by every component.
    Only the first (and here only) feature is read.
    """
    cov = np.asarray(model.covariances_)
    kind = model.covariance_type
    if kind == "full":
        var = cov[k, 0, 0]
    elif kind == "diag":
        var = cov[k, 0]
    elif kind == "spherical":
        var = cov[k]
    else:  # "tied": the same variance applies to all components
        var = cov[0, 0]
    return float(model.weights_[k]), float(model.means_[k, 0]), float(np.sqrt(var))


def normal_pdf(x, mean, std):
    """Return the normal density with the given mean and std evaluated at x."""
    return (1.0 / (np.sqrt(2 * np.pi) * std)) * np.exp(-0.5 * ((x - mean) / std) ** 2)


print("\n=== Parámetros del GMM (EM) ===")
for k in range(n_components):
    weight, mean, std = _component_params(gmm, k)
    print(f"Componente {k+1}:")
    print(f" peso (π) = {weight:.3f}")
    print(f" media (μ) = {mean:.3f} °C")
    print(f" desviación σ = {std:.3f} °C")

# ============================
# 3. PLOT HISTOGRAM + DENSITIES
# ============================
# Temperature grid for the curves, padded 1 °C beyond the data on each side.
# It is kept as a column because score_samples expects 2-D input.
x = np.linspace(temps.min() - 1, temps.max() + 1, 500).reshape(-1, 1)

# score_samples returns log-densities, so exponentiate to get the mixture pdf.
logprob = gmm.score_samples(x)
pdf_total = np.exp(logprob)

plt.figure(figsize=(8, 5))

# Histogram of the data.
plt.hist(temps, bins=12, density=True, alpha=0.4, label="Datos (histograma)")

# Density of the whole mixture.
plt.plot(x, pdf_total, label="GMM (mezcla total)", linewidth=2)

# Each component on its own, scaled by its weight so the dashed curves add up
# to the total mixture density.
for k in range(n_components):
    weight, mean, std = _component_params(gmm, k)
    pdf_comp = weight * normal_pdf(x, mean, std)
    plt.plot(x, pdf_comp, "--", linewidth=2, label=f"Componente {k+1}")

plt.title("GMM con EM sobre temperatura media mensual (TM) - León")
plt.xlabel("Temperatura media (°C)")
plt.ylabel("Densidad")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
Número de muestras usadas: 13 Primeras temperaturas: [ 7.1 9.5 13.3 14.8 18.6 24. 27.4 26.9 22.9 16.7] === Parámetros del GMM (EM) === Componente 1: peso (π) = 0.545 media (μ) = 21.448 °C desviación σ = 4.834 °C Componente 2: peso (π) = 0.455 media (μ) = 11.019 °C desviación σ = 3.258 °C