< Home
Week 2: Fitting
Fit a function to your data
Here is the dataset reloaded from the last assignment: average temperature in the Nordic capitals, downloaded from https://nordicstatistics.org/areas/geography-and-climate/
import pandas as pd
# Read the Nordic average-temperature table from the local CSV file
df = pd.read_csv("datasets/nordicaveragetemp.csv")
# Preview the first five rows of the table
df.head(n=5)
| Category | DK | FO | GL | FI | AX | IS | NO | SE | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1874 | 7.8 | NaN | NaN | 4.8 | NaN | NaN | 5.9 | 6.0 |
| 1 | 1875 | 6.9 | NaN | NaN | 1.9 | NaN | NaN | 4.3 | 4.3 |
| 2 | 1876 | 7.1 | NaN | NaN | 3.1 | NaN | NaN | 4.6 | 4.9 |
| 3 | 1877 | 6.8 | NaN | NaN | 3.3 | NaN | NaN | 3.6 | 4.7 |
| 4 | 1878 | 7.8 | NaN | NaN | 5.2 | NaN | NaN | 5.9 | 6.2 |
Show the last 5 rows
# Preview the final five rows of the table
df.tail(n=5)
| Category | DK | FO | GL | FI | AX | IS | NO | SE | |
|---|---|---|---|---|---|---|---|---|---|
| 146 | 2020 | 10.7 | 7.2 | -0.7 | 8.7 | 8.5 | 5.1 | 8.9 | 9.7 |
| 147 | 2021 | 9.6 | 7.0 | 0.1 | 6.6 | 6.9 | 5.4 | 7.3 | 8.1 |
| 148 | 2022 | 10.3 | 7.3 | -1.0 | 7.3 | 7.4 | 5.1 | 8.0 | 8.8 |
| 149 | 2023 | 10.1 | 7.3 | -0.3 | 7.1 | 6.7 | 5.0 | 7.0 | 8.0 |
| 150 | 2024 | 10.5 | 7.1 | -0.7 | NaN | 7.4 | 4.3 | 7.8 | 8.9 |
From the last assignment, where I asked ChatGPT for help: Now I want to create a visualisation using matplotlib and numpy (import matplotlib.pyplot as plt, import numpy as np). I want each Nordic country to have its own color and a continuous line, with the year on the x axis and the temperature on the y axis.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Read the temperature table; the "Category" column holds the year
df = pd.read_csv("datasets/nordicaveragetemp.csv")
df["Category"] = df["Category"].astype(int)

# Country codes to draw, each paired with one fixed line color
countries = ["DK", "FO", "GL", "FI", "AX", "IS", "NO", "SE"]
palette = ["red", "purple", "blue", "green", "orange", "cyan", "black", "brown"]
colors = dict(zip(countries, palette))

# One continuous line per country on a shared set of axes
fig, ax = plt.subplots(figsize=(14, 7))
years = df["Category"]
for country in (code for code in countries if code in df.columns):
    ax.plot(
        years,
        df[country],
        color=colors[country],
        label=country,
        linewidth=2,
    )

# Labels, grid and legend
ax.set_title("Average Annual Temperature in Nordic Countries (°C)")
ax.set_xlabel("Year")
ax.set_ylabel("Temperature (°C)")
ax.grid(True, linestyle="--", alpha=0.4)
ax.legend(title="Country", ncol=4)
fig.tight_layout()
plt.show()
ChatGPT help.
I am working on a dataset of average temperatures in the Nordic countries. I plotted it in the last assignment, and I was wondering how to add a fitted line: should I show the points as a scatter plot, or what should I do?
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Load the temperature table; the "Category" column holds the year
df = pd.read_csv("datasets/nordicaveragetemp.csv")
df["Category"] = df["Category"].astype(int)

countries = ["DK", "FO", "GL", "FI", "AX", "IS", "NO", "SE"]
colors = {
    "DK": "red", "FO": "purple", "GL": "blue", "FI": "green",
    "AX": "orange", "IS": "cyan", "NO": "black", "SE": "brown",
}

plt.figure(figsize=(14, 7))
x = df["Category"].values  # years

for country in countries:
    if country not in df.columns:
        continue
    y = df[country].values

    # Raw observations as small, semi-transparent scatter points
    plt.scatter(x, y, color=colors[country], s=10, alpha=0.6)

    # Fit only on the years that actually have a measurement
    mask = ~np.isnan(y)
    if mask.sum() < 2:
        # A straight line needs at least two points; np.polyfit raises on an
        # empty sample, so skip the trend for this country instead of crashing.
        continue
    coeff = np.polyfit(x[mask], y[mask], 1)
    trend = np.poly1d(coeff)

    # Dashed trend line, evaluated over the full range of years in the table
    plt.plot(
        x,
        trend(x),
        color=colors[country],
        linestyle="--",
        linewidth=2,
        alpha=0.9,
        label=f"{country} trend",
    )

plt.title("Average Annual Temperature in Nordic Countries with Trend Lines")
plt.xlabel("Year")
plt.ylabel("Temperature (°C)")
plt.grid(True, linestyle="--", alpha=0.4)
plt.legend(ncol=4, fontsize=9)
plt.tight_layout()
plt.show()
Now I ask ChatGPT:
How would we explore polynomial fits (np.polyfit with degree=2)?
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Load the temperature table; the "Category" column holds the year
df = pd.read_csv("datasets/nordicaveragetemp.csv")
df["Category"] = df["Category"].astype(int)

countries = ["DK", "FO", "GL", "FI", "AX", "IS", "NO", "SE"]
colors = {
    "DK": "red", "FO": "purple", "GL": "blue", "FI": "green",
    "AX": "orange", "IS": "cyan", "NO": "black", "SE": "brown",
}

plt.figure(figsize=(14, 7))
x = df["Category"].values  # years

# Dense, evenly spaced years so each fitted parabola renders as a smooth curve.
# The grid is the same for every country, so it is built once outside the loop.
x_smooth = np.linspace(x.min(), x.max(), 400)

for country in countries:
    if country not in df.columns:
        continue
    y = df[country].values

    # Raw observations as small, semi-transparent scatter points
    plt.scatter(x, y, color=colors[country], s=10, alpha=0.6)

    # Fit only on the years that actually have a measurement
    mask = ~np.isnan(y)
    if mask.sum() < 3:
        # A quadratic has three coefficients, so it needs at least three
        # points; skip the curve for this country instead of crashing or
        # producing an under-determined fit.
        continue
    coeff2 = np.polyfit(x[mask], y[mask], 2)  # quadratic fit
    poly2 = np.poly1d(coeff2)

    # Dotted fitted curve over the full range of years in the table
    plt.plot(
        x_smooth,
        poly2(x_smooth),
        color=colors[country],
        linestyle=":",
        linewidth=2,
        alpha=0.9,
        label=f"{country} (quadratic)",
    )

plt.title("Quadratic Polynomial Fit of Temperature Trends in Nordic Countries")
plt.xlabel("Year")
plt.ylabel("Temperature (°C)")
plt.grid(True, linestyle="--", alpha=0.4)
plt.legend(ncol=4, fontsize=9)
plt.tight_layout()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# -----------------------------
# 1. Load data
# -----------------------------
df = pd.read_csv("datasets/nordicaveragetemp.csv")
df["Category"] = df["Category"].astype(int)

country = "DK"  # country column to explore

# Keep only the rows where this country has a value, then pull the
# year (x) and temperature (y) columns out as float arrays
mask = ~df[country].isna()
x, y = (df.loc[mask, col].values.astype(float) for col in ("Category", country))

# -----------------------------
# 2. Create train/test split
# -----------------------------
np.random.seed(0)  # fixed seed so the shuffle is repeatable
indices = np.arange(len(x))
np.random.shuffle(indices)

# The first 70% of the shuffled indices train the model; the rest test it
split = int(0.7 * len(x))
train_idx, test_idx = indices[:split], indices[split:]
x_train, y_train = x[train_idx], y[train_idx]
x_test, y_test = x[test_idx], y[test_idx]

# Report how many samples landed in each part
print("Train size:", len(x_train))
print("Test size:", len(x_test))
# -----------------------------
# 3. RBF basis function
# -----------------------------
def rbf_matrix(x, centers):
    """Build the cubic radial-basis design matrix ``|x_i - c_j| ** 3``.

    Args:
        x: 1-D array-like of sample positions (length ``n``).
        centers: 1-D array-like of basis-function centers (length ``m``).

    Returns:
        An ``(n, m)`` array whose entry ``[i, j]`` is
        ``abs(x[i] - centers[j]) ** 3``.
    """
    # Coerce the inputs so plain lists, scalars and pandas Series work as
    # well as ndarrays (the 2-D indexing below needs real arrays).
    x = np.atleast_1d(x)
    centers = np.atleast_1d(centers)
    # Broadcast a column of samples against a row of centers
    return np.abs(x[:, None] - centers[None, :]) ** 3
# -----------------------------
# 4. Loop over number of centers
# -----------------------------
# Per-model results, appended in the same order as `ncenters`
errors = []
coeffs = []
centers_list = []

# Try 1..40 centers, capped at the number of training points because the
# centers are drawn from them without replacement
max_centers = min(40, len(x_train))
ncenters = np.arange(1, max_centers + 1)

for ncenter in ncenters:
    # Draw distinct training positions to serve as the RBF centers
    idx = np.random.choice(len(x_train), size=ncenter, replace=False)
    centers = x_train[idx]

    # Design matrices for the training and held-out samples
    M_train = rbf_matrix(x_train, centers)
    M_test = rbf_matrix(x_test, centers)

    # Least-squares weights for the basis functions
    coeff = np.linalg.lstsq(M_train, y_train, rcond=None)[0]

    # Mean absolute error on the held-out samples
    y_pred = M_test @ coeff
    error = np.abs(y_pred - y_test).mean()

    errors.append(error)
    coeffs.append(coeff)
    centers_list.append(centers)
# -----------------------------
# 5. Plot error vs complexity
# -----------------------------
# Test error as a function of the number of centers
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(ncenters, errors, marker="o")
ax.set_xlabel("Number of RBF centers")
ax.set_ylabel("Mean absolute error")
ax.set_title(f"Model complexity vs test error ({country})")
ax.grid(True, linestyle="--", alpha=0.4)
plt.show()

# Keep the centers and weights of the model with the lowest test error
best_idx = np.argmin(errors)
best_centers, best_coeff = centers_list[best_idx], coeffs[best_idx]
print("Best number of centers:", ncenters[best_idx])
Train size: 105 Test size: 46
Best number of centers: 38