import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

Index(['CRASH DATE', 'CRASH TIME', 'BOROUGH', 'LATITUDE', 'LONGITUDE',
       'ON STREET NAME', 'NUMBER OF PERSONS INJURED',
       'NUMBER OF PERSONS KILLED', 'NUMBER OF PEDESTRIANS INJURED',
       'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED',
       'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED',
       'NUMBER OF MOTORIST KILLED', 'CONTRIBUTING FACTOR VEHICLE',
       'VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2', 'VEHICLE TYPE CODE 3',
       'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5'],
      dtype='object')

LogisticRegression(max_iter=1000)

Accuracy: 1.0

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]

[[34]]

/opt/conda/lib/python3.13/site-packages/sklearn/metrics/_classification.py:534: UserWarning: A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.
  warnings.warn(

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

Index(['CRASH DATE', 'CRASH TIME', 'BOROUGH', 'LATITUDE', 'LONGITUDE',
       'ON STREET NAME', 'NUMBER OF PERSONS INJURED',
       'NUMBER OF PERSONS KILLED', 'NUMBER OF PEDESTRIANS INJURED',
       'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED',
       'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED',
       'NUMBER OF MOTORIST KILLED', 'CONTRIBUTING FACTOR VEHICLE',
       'VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2', 'VEHICLE TYPE CODE 3',
       'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5'],
      dtype='object')

LogisticRegression(max_iter=1000)

Accuracy: 1.0

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]

[[34]]

/opt/conda/lib/python3.13/site-packages/sklearn/metrics/_classification.py:534: UserWarning: A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.
  warnings.warn(

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv("datasets/MotorVehicle_CrashRecord.csv")
df.head(201)

df.columns

Index(['CRASH DATE', 'CRASH TIME', 'BOROUGH', 'LATITUDE', 'LONGITUDE',
       'ON STREET NAME', 'NUMBER OF PERSONS INJURED',
       'NUMBER OF PERSONS KILLED', 'NUMBER OF PEDESTRIANS INJURED',
       'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED',
       'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED',
       'NUMBER OF MOTORIST KILLED', 'CONTRIBUTING FACTOR VEHICLE',
       'VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2', 'VEHICLE TYPE CODE 3',
       'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5'],
      dtype='object')

df = df[[
    "LATITUDE",
    "LONGITUDE",
    "CONTRIBUTING FACTOR VEHICLE"
]]

df.head(201)

# Remove rows with missing latitude or longitude
df = df.dropna(subset=["LATITUDE", "LONGITUDE"])

# Replace missing or unspecified factors
df["CONTRIBUTING FACTOR VEHICLE"] = df["CONTRIBUTING FACTOR VEHICLE"].fillna("Unknown")

driver_factors = [
    "Aggressive Driving/Road Rage",
    "Pavement Slippery",
    "Unspecified",
    "Following Too Closely",
    "Passing Too Closely",
    "Failure to Yield Right-of-Way",
    "Driver Inexperience",
    "Passing or Lane Usage Improper",
    "Turning Improperly",
    "Unsafe Lane Changing",
    "Unsafe Speed",
    "Reaction to Uninvolved Vehicle",
    "Steering Failure",
    "Traffic Control Disregarded",
    "Other Vehicular",
    "Driver Inattention/Distraction",
    "Accelerator Defective",
    "Oversized Vehicle",
    "Pedestrian/Bicyclist/Other Pedestrian Error/Confusion",
    "Alcohol Involvement",
    "View Obstructed/Limited",
    "Illness",
    "Lost Consciousness",
    "Brakes Defective",
    "Backing Unsafely",
    "Glare",
    "Passenger Distraction",
    "Fell Asleep"
]

df["DRIVER_FAULT"] = df["CONTRIBUTING FACTOR VEHICLE"].apply(
    lambda x: 1 if x in driver_factors else 0
)

X = df[["LATITUDE", "LONGITUDE", "CONTRIBUTING FACTOR VEHICLE"]]
y = df["DRIVER_FAULT"]

X = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 1.0

print(y_pred)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]

results = pd.DataFrame({
    "Actual": y_test.values,
    "Predicted": y_pred
})

results.head(10)

from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, y_pred))

[[34]]

/opt/conda/lib/python3.13/site-packages/sklearn/metrics/_classification.py:534: UserWarning: A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.
  warnings.warn(

	CRASH DATE	CRASH TIME	BOROUGH	LATITUDE	LONGITUDE	ON STREET NAME	NUMBER OF PERSONS INJURED	NUMBER OF PERSONS KILLED	NUMBER OF PEDESTRIANS INJURED	NUMBER OF PEDESTRIANS KILLED	NUMBER OF CYCLIST INJURED	NUMBER OF CYCLIST KILLED	NUMBER OF MOTORIST INJURED	NUMBER OF MOTORIST KILLED	CONTRIBUTING FACTOR VEHICLE	VEHICLE TYPE CODE 1	VEHICLE TYPE CODE 2	VEHICLE TYPE CODE 3	VEHICLE TYPE CODE 4	VEHICLE TYPE CODE 5
0	9/11/2021	2:39	NaN	NaN	NaN	WHITESTONE EXPRESSWAY	2	0	0	0	0	0	2	0	Aggressive Driving/Road Rage	Sedan	Sedan	NaN	NaN	NaN
1	3/26/2022	11:45	NaN	NaN	NaN	QUEENSBORO BRIDGE UPPER	1	0	0	0	0	0	1	0	Pavement Slippery	Sedan	NaN	NaN	NaN	NaN
2	11/1/2023	1:29	BROOKLYN	40.621790	-73.970024	OCEAN PARKWAY	1	0	0	0	0	0	1	0	Unspecified	Moped	Sedan	Sedan	NaN	NaN
3	6/29/2022	6:55	NaN	NaN	NaN	THROGS NECK BRIDGE	0	0	0	0	0	0	0	0	Following Too Closely	Sedan	Pick-up Truck	NaN	NaN	NaN
4	9/21/2022	13:21	NaN	NaN	NaN	BROOKLYN BRIDGE	0	0	0	0	0	0	0	0	Passing Too Closely	Station Wagon/Sport Utility Vehicle	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
195	3/23/2022	21:00	QUEENS	40.776764	-73.848015	NaN	0	0	0	0	0	0	0	0	Unspecified	Sedan	NaN	NaN	NaN	NaN
196	3/26/2022	12:00	QUEENS	40.747498	-73.913090	NaN	0	0	0	0	0	0	0	0	Unspecified	Sedan	NaN	NaN	NaN	NaN
197	3/26/2022	13:28	BROOKLYN	40.701637	-73.942276	GRAHAM AVENUE	1	0	0	0	0	0	1	0	Following Too Closely	Station Wagon/Sport Utility Vehicle	Station Wagon/Sport Utility Vehicle	NaN	NaN	NaN
198	3/26/2022	19:12	QUEENS	40.693490	-73.826546	NaN	0	0	0	0	0	0	0	0	Unsafe Speed	Station Wagon/Sport Utility Vehicle	Sedan	NaN	NaN	NaN
199	3/26/2022	21:29	QUEENS	40.726463	-73.859430	WETHEROLE STREET	1	0	1	0	0	0	0	0	Unsafe Speed	Bike	NaN	NaN	NaN	NaN

	LATITUDE	LONGITUDE	CONTRIBUTING FACTOR VEHICLE
0	NaN	NaN	Aggressive Driving/Road Rage
1	NaN	NaN	Pavement Slippery
2	40.621790	-73.970024	Unspecified
3	NaN	NaN	Following Too Closely
4	NaN	NaN	Passing Too Closely
...	...	...	...
195	40.776764	-73.848015	Unspecified
196	40.747498	-73.913090	Unspecified
197	40.701637	-73.942276	Following Too Closely
198	40.693490	-73.826546	Unsafe Speed
199	40.726463	-73.859430	Unsafe Speed

	penalty	'l2'
	dual	False
	tol	0.0001
	C	1.0
	fit_intercept	True
	intercept_scaling	1
	class_weight	None
	random_state	None
	solver	'lbfgs'
	max_iter	1000
	multi_class	'deprecated'
	verbose	0
	warm_start	False
	n_jobs	None
	l1_ratio	None

What is Machine Learning?¶

Machine learning is a method in data science that uses data to train models so they can automatically learn, predict, or classify new information.¶

Predict the probability of injury based on time, location, and vehicle type¶

Classify crashes as minor or severe¶

Identify high-risk hours or boroughs¶

1. Imported the Required Libraries¶

2. Loaded the Dataset¶

3. Create a target variable (Label)¶

4. Clean Data (Inputs)¶

Crash hour¶

Contributing factors¶

5. Define the target variable¶

6. Define features (X) and target (y)¶

7. Convert categorical column to numeric¶

8. Train Test Split¶

9. Train the Model¶

10. Make predictions and check accuracy¶

	Actual	Predicted
0	1	1
1	1	1
2	1	1
3	1	1
4	1	1
5	1	1
6	1	1
7	1	1
8	1	1
9	1	1