import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import norm
# Load the crash records; the path is relative to the notebook's working directory.
df = pd.read_csv("datasets/MotorVehicle_CrashRecord.csv")
df.head()

# Derive the hour of day of each crash from the "CRASH TIME" column.
# Passing an explicit format avoids pandas' per-element `dateutil` fallback
# (and the UserWarning it emits), so every row is parsed the same way.
# NOTE(review): assumes "CRASH TIME" holds 24-hour "H:MM" strings (e.g. "2:39",
# "14:05") - confirm against the CSV. Values that do not match become NaN
# because of errors="coerce".
df["CRASH_HOUR"] = pd.to_datetime(
    df["CRASH TIME"], format="%H:%M", errors="coerce"
).dt.hour
df["CRASH_HOUR"].dropna().head()
# Histogram of crashes by hour of day, normalised to a probability distribution.
# Explicit unit-width edges 0, 1, ..., 24 give exactly one bin per hour regardless
# of which hours happen to occur in the data. With density=True and a bin width
# of 1, each bar's height is the fraction of crashes recorded in that hour.
hour_bin_edges = list(range(25))

plt.figure(figsize=(8, 5))
df["CRASH_HOUR"].plot(
    kind="hist",
    bins=hour_bin_edges,
    density=True,
)

plt.xticks(range(0, 25, 2))
plt.xlabel("Hour of Day")
plt.ylabel("Probability Density")
plt.title("Probability Distribution of Motor Vehicle Crashes by Hour")
plt.show()

# Per-crash injury counts, with missing values removed.
injuries = df["NUMBER OF PERSONS INJURED"].dropna()

# The column is a count, so use unit-width bins centred on the whole numbers
# (-0.5..0.5, 0.5..1.5, ...). With density=True and a bin width of 1, each bar's
# height is the share of crashes with that many injuries, instead of being
# inflated by many narrow, mostly empty bins.
# NOTE(review): assumes the counts are whole numbers - confirm against the CSV.
injury_bin_edges = np.arange(-0.5, injuries.max() + 1.5)

plt.figure(figsize=(8, 5))
plt.hist(injuries, bins=injury_bin_edges, density=True)

plt.xticks(range(int(injuries.max()) + 1))
plt.xlabel("Number of Persons Injured")
plt.ylabel("Probability Density")
plt.title("Probability Distribution of Number of Persons Injured")
plt.show()

# Parameters of the normal distribution fitted to the injury counts
# (sample mean and sample standard deviation).
mean = injuries.mean()
std = injuries.std()

# Evaluate the normal pdf on a dense, evenly spaced grid rather than at the
# observed values only: the data take just a few distinct values, which would
# reduce the "curve" to a coarse polyline.
x = np.linspace(injuries.min() - 1, injuries.max() + 1, 200)

# Unit-width bins centred on whole numbers keep the histogram heights on the
# same scale as the fitted pdf, so the two can be compared directly.
# NOTE(review): assumes the counts are whole numbers - confirm against the CSV.
bin_edges = np.arange(-0.5, injuries.max() + 1.5)

plt.figure(figsize=(8, 5))
plt.hist(injuries, bins=bin_edges, density=True, alpha=0.6, label="Observed")
plt.plot(x, norm.pdf(x, mean, std), label="Normal fit")

plt.xlabel("Number of Persons Injured")
plt.ylabel("Probability Density")
plt.title("Injury Distribution vs Normal Distribution")
plt.legend()
plt.show()
injuries.describe()

/tmp/ipykernel_477/2859588920.py:6: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  df["CRASH_HOUR"] = pd.to_datetime(df["CRASH TIME"], errors="coerce").dt.hour

count    200.000000
mean       0.515000
std        0.789233
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        4.000000
Name: NUMBER OF PERSONS INJURED, dtype: float64

WEEK 03 - Probability

Goal

- Investigate the probability distribution of your data

- Set up template notebooks and slides for your data set analysis

Probability distribution of your data

Template notebook and slides