< Home
Machine learning
Fit a machine learning model to your data
Here is the dataset reloaded from the last assignment: average temperature in the Nordic capitals, downloaded from https://nordicstatistics.org/areas/geography-and-climate/
In the last assignment I asked ChatGPT for help: "Now I want to create a visualisation using matplotlib and numpy (`import matplotlib.pyplot as plt`, `import numpy as np`). I want each Nordic country to have its own color and a continuous line, with the year on the x-axis and the temperature on the y-axis."
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Read the Nordic average-temperature table from disk.
df = pd.read_csv("datasets/nordicaveragetemp.csv")

# The "Category" column holds the years; cast it to int for the x-axis.
df["Category"] = df["Category"].astype(int)

# One fixed line colour per country column code.
colors = {
    "DK": "red",
    "FO": "purple",
    "GL": "blue",
    "FI": "green",
    "AX": "orange",
    "IS": "cyan",
    "NO": "black",
    "SE": "brown",
}

# Column codes to draw, in the same order as the colour table.
countries = list(colors)

# Draw one continuous line per country on a single shared figure.
plt.figure(figsize=(14, 7))
for country in countries:
    # Codes that are not columns of the loaded table are skipped.
    if country not in df.columns:
        continue
    plt.plot(
        df["Category"],
        df[country],
        color=colors[country],
        label=country,
        linewidth=2,
    )

# Title, axis labels, a light dashed grid and a four-column legend.
plt.title("Average Annual Temperature in Nordic Countries (°C)")
plt.xlabel("Year")
plt.ylabel("Temperature (°C)")
plt.grid(True, linestyle="--", alpha=0.4)
plt.legend(title="Country", ncol=4)
plt.tight_layout()
plt.show()
I asked ChatGPT:
My assignment is: "Fit a machine learning model to your data". My dataset is about average temperature in the Nordic countries: "datasets/nordicaveragetemp.csv".
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Column code of the series to model ("IS" – Iceland).
country = "IS"

# Load the table, keep only the year column and the chosen series, drop rows
# with missing values, then cast the years to int.
df = (
    pd.read_csv("datasets/nordicaveragetemp.csv")[["Category", country]]
    .dropna()
    .assign(Category=lambda frame: frame["Category"].astype(int))
)

# Feature matrix (year, kept 2-D for scikit-learn) and target (temperature).
X = df[["Category"]]
y = df[country]

# Hold out 20% of the rows for evaluation. shuffle=False splits in row order,
# so the final 20% of rows become the test set.
# NOTE(review): this is a chronological split only if the CSV is sorted by
# year - confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# Fit a straight line on the training rows and predict the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions against the held-out temperatures.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MSE:", mse)
print("R²:", r2)

# Show both splits as scatter points and the prediction as a red line.
plt.figure(figsize=(10, 5))
for features, target, tag in (
    (X_train, y_train, "Training data"),
    (X_test, y_test, "Test data"),
):
    plt.scatter(features, target, label=tag)
plt.plot(X_test, y_pred, color="red", label="Model prediction")
plt.title(f"Linear Regression: Predicting {country} Temperature")
plt.xlabel("Year")
plt.ylabel("Temperature (°C)")
plt.legend()
plt.show()
MSE: 0.5229391757350396 R²: -1.210940197987946
Here is the same code, again for Iceland, but with the test size changed to 30%.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Column code of the series to model ("IS" – Iceland).
country = "IS"

# Load the table, keep only the year column and the chosen series, drop rows
# with missing values, then cast the years to int.
df = (
    pd.read_csv("datasets/nordicaveragetemp.csv")[["Category", country]]
    .dropna()
    .assign(Category=lambda frame: frame["Category"].astype(int))
)

# Feature matrix (year, kept 2-D for scikit-learn) and target (temperature).
X = df[["Category"]]
y = df[country]

# Hold out 30% of the rows for evaluation. shuffle=False splits in row order,
# so the final 30% of rows become the test set.
# NOTE(review): this is a chronological split only if the CSV is sorted by
# year - confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=False
)

# Fit a straight line on the training rows and predict the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions against the held-out temperatures.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MSE:", mse)
print("R²:", r2)

# Show both splits as scatter points and the prediction as a red line.
plt.figure(figsize=(10, 5))
for features, target, tag in (
    (X_train, y_train, "Training data"),
    (X_test, y_test, "Test data"),
):
    plt.scatter(features, target, label=tag)
plt.plot(X_test, y_pred, color="red", label="Model prediction")
plt.title(f"Linear Regression: Predicting {country} Temperature")
plt.xlabel("Year")
plt.ylabel("Temperature (°C)")
plt.legend()
plt.show()
MSE: 1.5390092566312272 R²: -6.850876535756213
Here is the code for Greenland, again with a test size of 30%.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Column code of the series to model ("GL" – Greenland).
country = "GL"

# Load the table, keep only the year column and the chosen series, drop rows
# with missing values, then cast the years to int.
df = (
    pd.read_csv("datasets/nordicaveragetemp.csv")[["Category", country]]
    .dropna()
    .assign(Category=lambda frame: frame["Category"].astype(int))
)

# Feature matrix (year, kept 2-D for scikit-learn) and target (temperature).
X = df[["Category"]]
y = df[country]

# Hold out 30% of the rows for evaluation. shuffle=False splits in row order,
# so the final 30% of rows become the test set.
# NOTE(review): this is a chronological split only if the CSV is sorted by
# year - confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=False
)

# Fit a straight line on the training rows and predict the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions against the held-out temperatures.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MSE:", mse)
print("R²:", r2)

# Show both splits as scatter points and the prediction as a red line.
plt.figure(figsize=(10, 5))
for features, target, tag in (
    (X_train, y_train, "Training data"),
    (X_test, y_test, "Test data"),
):
    plt.scatter(features, target, label=tag)
plt.plot(X_test, y_pred, color="red", label="Model prediction")
plt.title(f"Linear Regression: Predicting {country} Temperature")
plt.xlabel("Year")
plt.ylabel("Temperature (°C)")
plt.legend()
plt.show()
MSE: 1.6168748657950063 R²: 0.047091339555872636