import pandas as pd

# Load the dataset
df = pd.read_csv("datasets/nordicaveragetemp.csv")

# Show the first 5 rows.
# A bare `df.head()` expression is evaluated and thrown away when this file
# runs as a script (and in a notebook only a cell's last expression is
# rendered), so print the preview explicitly.
print(df.head())

# Show the last 5 rows
print(df.tail())

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Read the temperature table and make sure the year column is integral.
df = pd.read_csv("datasets/nordicaveragetemp.csv")
df["Category"] = df["Category"].astype(int)

# Country columns to draw, in legend order, paired with a fixed line color.
countries = ["DK", "FO", "GL", "FI", "AX", "IS", "NO", "SE"]
colors = dict(zip(
    countries,
    ["red", "purple", "blue", "green", "orange", "cyan", "black", "brown"],
))

plt.figure(figsize=(14, 7))

# One line per country; codes that are not columns of the CSV are skipped.
for country in (code for code in countries if code in df.columns):
    line_style = {"color": colors[country], "label": country, "linewidth": 2}
    plt.plot(df["Category"], df[country], **line_style)

# Axis labelling and cosmetics.
plt.title("Average Annual Temperature in Nordic Countries (°C)")
plt.xlabel("Year")
plt.ylabel("Temperature (°C)")
plt.grid(True, linestyle="--", alpha=0.4)
plt.legend(title="Country", ncol=4)
plt.tight_layout()
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Scatter every country's yearly temperatures and overlay a least-squares
# straight line fitted to the years that have a measurement.
df = pd.read_csv("datasets/nordicaveragetemp.csv")
df["Category"] = df["Category"].astype(int)

countries = ["DK", "FO", "GL", "FI", "AX", "IS", "NO", "SE"]

colors = {
    "DK": "red", "FO": "purple", "GL": "blue", "FI": "green",
    "AX": "orange", "IS": "cyan", "NO": "black", "SE": "brown"
}

plt.figure(figsize=(14, 7))

x = df["Category"].values  # years

for country in countries:
    # Skip country codes that are not present as columns in the CSV.
    if country not in df.columns:
        continue

    y = df[country].values

    # Plot actual data as scatter points
    plt.scatter(x, y, color=colors[country], s=10, alpha=0.6)

    # Fit only on the years that actually have a measurement.
    mask = ~np.isnan(y)

    # A straight line has two coefficients: np.polyfit rejects an empty
    # series and reports a rank-deficient fit for a single point, so draw no
    # trend line when fewer than two values are available.
    if mask.sum() < 2:
        continue

    # Linear fit (degree-1 polynomial)
    coeff = np.polyfit(x[mask], y[mask], 1)
    trend = np.poly1d(coeff)

    # Plot the trend line, evaluated at every year in the table
    plt.plot(
        x,
        trend(x),
        color=colors[country],
        linestyle="--",
        linewidth=2,
        alpha=0.9,
        label=f"{country} trend"
    )

plt.title("Average Annual Temperature in Nordic Countries with Trend Lines")
plt.xlabel("Year")
plt.ylabel("Temperature (°C)")
plt.grid(True, linestyle="--", alpha=0.4)
plt.legend(ncol=4, fontsize=9)
plt.tight_layout()
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Scatter every country's yearly temperatures and overlay a least-squares
# quadratic curve fitted to the years that have a measurement.
df = pd.read_csv("datasets/nordicaveragetemp.csv")
df["Category"] = df["Category"].astype(int)

countries = ["DK", "FO", "GL", "FI", "AX", "IS", "NO", "SE"]

colors = {
    "DK": "red", "FO": "purple", "GL": "blue", "FI": "green",
    "AX": "orange", "IS": "cyan", "NO": "black", "SE": "brown"
}

plt.figure(figsize=(14, 7))
x = df["Category"].values  # years

# A smooth x-axis for plotting the quadratic curves.  It depends only on the
# year range, so it is built once instead of once per country.
x_smooth = np.linspace(x.min(), x.max(), 400)

for country in countries:
    # Skip country codes that are not present as columns in the CSV.
    if country not in df.columns:
        continue

    y = df[country].values

    # Scatter points for the real data
    plt.scatter(x, y, color=colors[country], s=10, alpha=0.6)

    # Fit only on the years that actually have a measurement.
    mask = ~np.isnan(y)

    # A quadratic has three coefficients: np.polyfit rejects an empty series
    # and reports a rank-deficient fit for fewer points than coefficients,
    # so draw no curve when fewer than three values are available.
    if mask.sum() < 3:
        continue

    # Curve fitting (degree 2 polynomial)
    coeff2 = np.polyfit(x[mask], y[mask], 2)   # quadratic fit
    poly2 = np.poly1d(coeff2)

    plt.plot(
        x_smooth,
        poly2(x_smooth),
        color=colors[country],
        linestyle=":",
        linewidth=2,
        alpha=0.9,
        label=f"{country} (quadratic)"
    )

plt.title("Quadratic Polynomial Fit of Temperature Trends in Nordic Countries")
plt.xlabel("Year")
plt.ylabel("Temperature (°C)")
plt.grid(True, linestyle="--", alpha=0.4)
plt.legend(ncol=4, fontsize=9)
plt.tight_layout()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# -----------------------------
# 1. Load data
# -----------------------------
df = pd.read_csv("datasets/nordicaveragetemp.csv")
df["Category"] = df["Category"].astype(int)

country = "DK"   # column code of the country to model

# Keep only the rows where this country has a value, as float arrays:
# x holds the years, y the matching temperatures.
mask = df[country].notna()
x = df.loc[mask, "Category"].to_numpy(dtype=float)
y = df.loc[mask, country].to_numpy(dtype=float)

# -----------------------------
# 2. Create train/test split
# -----------------------------
# Fixed seed so the shuffled row order (and hence the split) is repeatable.
np.random.seed(0)
indices = np.arange(x.size)
np.random.shuffle(indices)

# First 70 % of the shuffled rows train the model, the remainder tests it.
split = int(0.7 * x.size)
train_idx, test_idx = indices[:split], indices[split:]

x_train, y_train = x[train_idx], y[train_idx]
x_test, y_test = x[test_idx], y[test_idx]

# Report the size of each partition
print("Train size:", x_train.size)
print("Test size:", x_test.size)

# -----------------------------
# 3. RBF basis function
# -----------------------------
def rbf_matrix(x, centers):
    """Return the cubic radial-basis design matrix ``|x_i - c_j| ** 3``.

    Parameters
    ----------
    x : array_like
        Sample locations; a scalar or a 1-D sequence.
    centers : array_like
        RBF center locations; a scalar or a 1-D sequence.

    Returns
    -------
    numpy.ndarray
        Float array whose entry ``[i, j]`` is ``abs(x[i] - centers[j]) ** 3``.
        For 1-D inputs the shape is ``(len(x), len(centers))``.
    """
    # Coerce to float arrays so plain lists and scalars are accepted and
    # integer inputs cannot overflow when cubed.
    x = np.atleast_1d(np.asarray(x, dtype=float))
    centers = np.atleast_1d(np.asarray(centers, dtype=float))

    # A column of samples broadcast against a row of centers gives every
    # pairwise difference in one vectorized step.
    return np.abs(x[:, None] - centers[None, :]) ** 3

# -----------------------------
# 4. Loop over number of centers
# -----------------------------
# Per-model results, index-aligned with `ncenters`.
errors, coeffs, centers_list = [], [], []

# Try every model size from 1 center up to 40 (or fewer if the training
# split is smaller than that).
max_centers = min(40, len(x_train))
ncenters = np.arange(1, max_centers + 1)

for ncenter in ncenters:
    # Draw `ncenter` distinct training x-values to serve as RBF centers.
    centers = x_train[
        np.random.choice(len(x_train), size=ncenter, replace=False)
    ]

    # Least-squares weights for the RBF basis on the training split.
    coeff = np.linalg.lstsq(
        rbf_matrix(x_train, centers), y_train, rcond=None
    )[0]

    # Mean absolute error of the fitted model on the held-out split.
    residual = rbf_matrix(x_test, centers) @ coeff - y_test
    errors.append(np.abs(residual).mean())

    coeffs.append(coeff)
    centers_list.append(centers)

# -----------------------------
# 5. Plot error vs complexity
# -----------------------------
plt.figure(figsize=(8,4))
plt.plot(ncenters, errors, marker="o")
plt.title(f"Model complexity vs test error ({country})")
plt.xlabel("Number of RBF centers")
plt.ylabel("Mean absolute error")
plt.grid(True, linestyle="--", alpha=0.4)
plt.show()

# Keep the model with the lowest test error.
best_idx = np.argmin(errors)
best_centers, best_coeff = centers_list[best_idx], coeffs[best_idx]

print("Best number of centers:", ncenters[best_idx])

Train size: 105
Test size: 46

Best number of centers: 38

	Category	DK	FO	GL	FI	AX	IS	NO	SE
146	2020	10.7	7.2	-0.7	8.7	8.5	5.1	8.9	9.7
147	2021	9.6	7.0	0.1	6.6	6.9	5.4	7.3	8.1
148	2022	10.3	7.3	-1.0	7.3	7.4	5.1	8.0	8.8
149	2023	10.1	7.3	-0.3	7.1	6.7	5.0	7.0	8.0
150	2024	10.5	7.1	-0.7	NaN	7.4	4.3	7.8	8.9

	Category	DK	FO	GL	FI	AX	IS	NO	SE
0	1874	7.8	NaN	NaN	4.8	NaN	NaN	5.9	6.0
1	1875	6.9	NaN	NaN	1.9	NaN	NaN	4.3	4.3
2	1876	7.1	NaN	NaN	3.1	NaN	NaN	4.6	4.9
3	1877	6.8	NaN	NaN	3.3	NaN	NaN	3.6	4.7
4	1878	7.8	NaN	NaN	5.2	NaN	NaN	5.9	6.2

Week 2: Fitting