Yuichi TAMIYA - Fab Futures 2025 - Data Science
Home About Tips

Number of Compostela Recipients by Month¶

There are statistics on Oficina del Peregrino, which are beautifully visualized, but the datasets are not downloadable.

There is Statistical Information on Pilgrims Since 2004(PDF) on Asociación de Amigos del Camino de Santiago en Japón

I selected numbers from some tables in the PDF and made some CSV files.

In [1]:
import pandas as pd

df = pd.read_csv("datasets/finalproject/camino_monthly_2004_2024.csv")
df.head()
Out[1]:
Month 2004 2005 2006 2007 2008 2009 2010 2011 2012 ... 2017 2018 2019 2020 2021 2022 2023 2024 Total Ratio
0 Jan 696 269 314 350 306 520 1169 649 512 ... 1355 1628 1737 2001 60 1616 2030 2125 21569 0.40%
1 Feb 1404 558 351 666 703 681 1640 820 1300 ... 1696 2181 2119 3076 14 2034 2874 2959 30705 0.60%
2 Mar 3105 3128 1093 1680 5328 1808 5879 3024 3257 ... 5176 11056 7474 1948 194 7389 11497 22163 121724 2.50%
3 Apr 15556 3307 7438 8112 5655 10244 19580 14154 14753 ... 26928 22067 31721 1024 34282 40811 40944 0 358280 7.20%
4 May 16862 9310 9992 12898 15988 16445 28787 19833 21766 ... 35345 40665 46673 4295 48255 60332 69572 0 573438 11.60%

5 rows × 21 columns

In [2]:
import pandas as pd
import matplotlib.pyplot as plt

# === CSV 読み込み ===
df = pd.read_csv("datasets/finalproject/camino_monthly_2004_2024.csv")

months = df["Month"]

# 対象とする年
years = [
    "2004","2005","2006","2007","2008","2009","2010",
    "2011","2012","2016","2017","2018","2019",
    "2020","2021","2022","2023","2024"
]

# === カラーマップ ===
cmap = plt.get_cmap("tab20")   # 20色の視認性が高い色パレット

plt.figure(figsize=(16, 8))

for i, year in enumerate(years):
    color = cmap(i % 20)  # 20色をループ利用
    plt.plot(
        months,
        df[year],
        label=year,
        color=color,
        linewidth=2.0,
        marker="o",
        markersize=4
    )

# === タイトルやラベル ===
plt.title("Monthly Pilgrims by Year (2004–2024)", fontsize=16)
plt.xlabel("Month", fontsize=14)
plt.ylabel("Number of Pilgrims", fontsize=14)
plt.grid(True, alpha=0.3)

# === 凡例(外側へ) ===
plt.legend(
    title="Year",
    bbox_to_anchor=(1.05, 1),
    loc="upper left",
    fontsize=10
)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# === CSV 読み込み ===
df = pd.read_csv("datasets/finalproject/camino_monthly_2004_2024.csv")

# 月を 1〜12 の数値に変換
months_num = np.arange(1, 13)

# フィッティングしたい年
year = "2024"
y = df[year]

# === 多項式フィッティング (3次) ===
coeff = np.polyfit(months_num, y, 3)   # 3次式
poly = np.poly1d(coeff)

# フィッティング曲線用の細かい x
x_fit = np.linspace(1, 12, 200)
y_fit = poly(x_fit)

# === プロット ===
plt.figure(figsize=(10, 6))

# 点のみ
plt.scatter(months_num, y, color="blue", label=f"{year} data")

# フィッティング曲線
plt.plot(x_fit, y_fit, color="red", linewidth=2, label="Fitted curve (3rd degree)")

plt.title(f"Fitting Example for {year}")
plt.xlabel("Month")
plt.ylabel("Pilgrims")
plt.grid(True)
plt.legend()

plt.show()

# === 近似式表示 ===
print("近似多項式(3次):")
print(poly)
No description has been provided for this image
近似多項式(3次):
        3        2
-448.7 x + 7080 x - 2.286e+04 x + 2.14e+04

In the statistics on Oficina del Peregrino, we can see 2025 data, which is not included the above csv.

In [4]:
import numpy as np
import matplotlib.pyplot as plt

# ========================
# 1. 手入力データ(2025年)
# ========================
data_2025 = {
    "Jan": 2781,
    "Feb": 3410,
    "Mar": 13437,
    "Apr": 52399,
    "May": 74556,
    "Jun": 69529,
    "Jul": 63856,
    "Aug": 73121,
    "Sep": 76985,
    "Oct": 57349,
    "Nov": 10022,
    "Dec": 926
}

# 月の順番(1〜12)
months_str = list(data_2025.keys())
months_num = np.arange(1, 13)     # 1~12
values = np.array(list(data_2025.values()))

# ========================
# 2. 多項式フィッティング(3次)
# ========================
coeff = np.polyfit(months_num, values, 3)
poly = np.poly1d(coeff)

# フィッティング線用の細かい x
x_fit = np.linspace(1, 12, 300)
y_fit = poly(x_fit)

# ========================
# 3. プロット
# ========================
plt.figure(figsize=(10, 6))

# 点のみ
plt.scatter(months_num, values, label="2025 data")

# フィッティング曲線
plt.plot(x_fit, y_fit, linewidth=2, label="Fitted curve (3rd degree)")

plt.title("Pilgrims per Month (2025)")
plt.xlabel("Month")
plt.ylabel("Pilgrims")

# x 軸を Jan〜Dec に
plt.xticks(months_num, months_str)

plt.grid(True)
plt.legend()

plt.show()

# ========================
# 4. 近似式の表示
# ========================
print("近似多項式(3次):")
print(poly)
No description has been provided for this image
近似多項式(3次):
        3        2
-200.8 x + 1284 x + 1.464e+04 x - 2.135e+04
In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# =========================
# 1. CSV 読み込み(2004〜2024)
# =========================
df = pd.read_csv("datasets/finalproject/camino_monthly_2004_2024.csv")

months_num = np.arange(1, 13)
months_str = ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"]

# --- 2024 年データ ---
y_2024 = df["2024"]

# --- 2024 のフィッティング ---
coeff_2024 = np.polyfit(months_num, y_2024, 3)
poly_2024 = np.poly1d(coeff_2024)
x_fit = np.linspace(1, 12, 300)
y_2024_fit = poly_2024(x_fit)

# =========================
# 2. 手入力データ(2025年)
# =========================
data_2025 = {
    "Jan": 2781,
    "Feb": 3410,
    "Mar": 13437,
    "Apr": 52399,
    "May": 74556,
    "Jun": 69529,
    "Jul": 63856,
    "Aug": 73121,
    "Sep": 76985,
    "Oct": 57349,
    "Nov": 10022,
    "Dec": 926
}
values_2025 = np.array(list(data_2025.values()))

# --- 2025 のフィッティング ---
coeff_2025 = np.polyfit(months_num, values_2025, 3)
poly_2025 = np.poly1d(coeff_2025)
y_2025_fit = poly_2025(x_fit)

# =========================
# 3. プロット(2024 + 2025)
# =========================
plt.figure(figsize=(12, 7))

# --- 2024 ---
plt.scatter(months_num, y_2024, color="blue", label="2024 data")
plt.plot(x_fit, y_2024_fit, color="blue", linestyle="--", linewidth=2, label="2024 fitted")

# --- 2025 ---
plt.scatter(months_num, values_2025, color="red", label="2025 data")
plt.plot(x_fit, y_2025_fit, color="red", linestyle="-", linewidth=2, label="2025 fitted")

plt.title("Pilgrims per Month: 2024 vs 2025")
plt.xlabel("Month")
plt.ylabel("Pilgrims")
plt.xticks(months_num, months_str)
plt.grid(True)
plt.legend()
plt.show()

# =========================
# 4. 近似式表示
# =========================
print("=== 2024 cubic approximation polynomial ===")
print(poly_2024)
print("\n=== 2025 cubic approximation polynomial ===")
print(poly_2025)
No description has been provided for this image
=== 2024 cubic approximation polynomial ===
        3        2
-448.7 x + 7080 x - 2.286e+04 x + 2.14e+04

=== 2025 cubic approximation polynomial ===
        3        2
-200.8 x + 1284 x + 1.464e+04 x - 2.135e+04

2025年巡礼者数予測のためのPythonコード(CSV読み込み対応版)¶

In [6]:
### 2025年巡礼者数予測のためのPythonコード(CSV読み込み対応版)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# === 1. データの読み込みと準備 ===

FILE_PATH = "datasets/finalproject/camino_monthly_2004_2024.csv"

# CSVファイルを指定されたパスから読み込む
df_raw = pd.read_csv(FILE_PATH)

# 不要な列 ('Total', 'Ratio') があれば削除する (ソース [1] に存在する)
if 'Total' in df_raw.columns:
    df_raw = df_raw.drop(columns=['Total'])
if 'Ratio' in df_raw.columns:
    df_raw = df_raw.drop(columns=['Ratio'])

# Month列が文字列 ('Jan', 'Feb'...) の場合、数値に変換する
if df_raw['Month'].dtype == object:
    month_mapping = {m: i+1 for i, m in enumerate(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])}
    df_raw['Month'] = df_raw['Month'].map(month_mapping)


# データを長い形式(tidy format)に変換
df_long = df_raw.set_index('Month').stack().reset_index()
df_long.columns = ['Month', 'Year_str', 'Pilgrims']
df_long['Year'] = df_long['Year_str'].astype(int)


# 安定した予測モデル構築のため、パンデミック期間のデータ(2020年〜2021年)を除外
df_train = df_long[
    (df_long['Year'] < 2020) | (df_long['Year'] > 2021)
].copy()

# 極端に低い値(ノイズや一時的な閉鎖期間のデータ)も学習データから除く
df_train = df_train[df_train['Pilgrims'] > 100]


# === 2. 特徴量エンジニアリング ===
# 連続的な時間変数(トレンド)の作成: 2004年1月から始まる月数
df_train['Time'] = (df_train['Year'] - 2004) * 12 + df_train['Month']

# 季節性(サイクル)を捉えるための三角関数特徴量の作成 (周期は12ヶ月)
df_train['sin_month'] = np.sin(2 * np.pi * df_train['Month'] / 12)
df_train['cos_month'] = np.cos(2 * np.pi * df_train['Month'] / 12)

# 特徴量行列 X と目的変数 Y の定義
X = df_train[['Time', 'sin_month', 'cos_month']]
y = df_train['Pilgrims']

# === 3. モデルの適合 (Fitting) ===
# 線形最小二乗法モデル (Linear Least Squares) の初期化と学習
model = LinearRegression()
model.fit(X, y)

# 適合結果の評価 (RMSE)
y_pred_train = model.predict(X)
rmse = np.sqrt(mean_squared_error(y, y_pred_train))
print(f"訓練データのRMSE (巡礼者数): {rmse:.2f}")

# === 4. 2025年の予測 ===
# 2025年の予測データフレームを作成
months_2025 = np.arange(1, 13)
df_2025 = pd.DataFrame({'Month': months_2025, 'Year': 2025})

# 2025年のTime変数: 2004年1月を基点として計算
df_2025['Time'] = (df_2025['Year'] - 2004) * 12 + df_2025['Month']

# 2025年の季節性特徴量を計算
df_2025['sin_month'] = np.sin(2 * np.pi * df_2025['Month'] / 12)
df_2025['cos_month'] = np.cos(2 * np.pi * df_2025['Month'] / 12)

# 2025年の特徴量行列
X_2025 = df_2025[['Time', 'sin_month', 'cos_month']]

# 予測の実行
predictions_2025 = model.predict(X_2025)

# 予測結果をデータフレームに追加し、負の予測値をゼロに修正
df_2025['Predicted_Pilgrims'] = np.maximum(0, predictions_2025).round().astype(int)


# === 5. 結果の表示と可視化 (X軸改善) ===

# 2025年予測の合計
total_2025 = df_2025['Predicted_Pilgrims'].sum()

print("\n--- 2025年 巡礼者数 予測結果 (月別) ---")
print(df_2025[['Month', 'Predicted_Pilgrims']])
print(f"\n2025年 合計予測巡礼者数: {total_2025}")

# 可視化:訓練データと予測の比較
plt.figure(figsize=(16, 8)) # グラフを大きくし、ラベルが見やすくなるように調整

# 訓練データの実績値をプロット(黒点)
plt.scatter(
    df_train['Time'],
    df_train['Pilgrims'],
    label='Observed Data (Cleaned Training Set)',
    color='black',
    alpha=0.5
)

# 訓練データに対するモデルの適合結果をプロット(青線)
plt.plot(
    df_train['Time'],
    y_pred_train,
    label='Fitted Model (Training)',
    color='blue'
)

# 2025年の予測をプロット(赤線)
plt.plot(
    df_2025['Time'],
    df_2025['Predicted_Pilgrims'],
    label='2025 Prediction',
    color='red',
    linestyle='--',
    marker='o',
    markersize=5
)

# --- X軸の目盛りの改善 ---

# 1. 表示したい年のリストを作成(データ範囲 2004年から2025年まで)
# 注: 資料のデータは2013-2015が欠落しているが、時間軸は連続しているため、連続的に設定する
years_to_label = np.arange(2004, 2026, 1) # 2004年から2025年まで毎年表示

# 2. 各年の1月(Month=1)に対応する連続的なTime値を計算
# Time = (Year - 2004) * 12 + Month
tick_positions = (years_to_label - 2004) * 12 + 1
tick_labels = [str(y) for y in years_to_label]

# 3. グラフに目盛りとラベルを適用
plt.xticks(tick_positions, tick_labels, rotation=45, ha='right')

# 4. タイトルとラベルを更新
plt.title("Pilgrims Trend and Seasonal Prediction (Annual Milestones)", fontsize=18)
plt.xlabel("Year (Marked at January of Each Year)", fontsize=14)
plt.ylabel("Number of Pilgrims", fontsize=14)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
訓練データのRMSE (巡礼者数): 9126.08

--- 2025年 巡礼者数 予測結果 (月別) ---
    Month  Predicted_Pilgrims
0       1               15403
1       2               17222
2       3               24482
3       4               35271
4       5               46730
5       6               55822
6       7               60142
7       8               58567
8       9               51551
9      10               41007
10     11               29792
11     12               20945

2025年 合計予測巡礼者数: 456934
No description has been provided for this image

Graph Components and Meaning

Element Color / Style Explanation
Observed Data (Cleaned Training Set) Black Scattered Dots These are the actual monthly pilgrim counts used to train the model. Data points corresponding to the highly disrupted 2020 and 2021 years were deliberately excluded during the model training phase. This cleaning process ensures that the model learns the stable, long-term trend and seasonality, rather than the temporary anomalies caused by the pandemic (e.g., 2020 totals are significantly lower than surrounding years).
Fitted Model (Training) Solid Blue Line This line represents the mathematical function (a combination of a continuous linear trend and trigonometric functions for seasonality) that was fitted to the observed black dots using the Least Squares method. The blue line is the best fit function that simultaneously captures the yearly cyclical pattern and the overall growth over time.
2025 Prediction Red Dashed Line and Markers This is the forecast generated by extrapolating the fitted blue function into the year 2025. It shows the expected monthly pilgrim counts for 2025, assuming the historical trend and seasonal patterns continue, and barring any major exogenous events like the pandemic.

Key Insights Derived from the Model

  1. Strong Seasonality: The pronounced cyclical wave pattern clearly visible in both the observed data and the fitted blue line indicates that the model successfully captured the strong seasonal variation. The data confirms that pilgrim numbers peak dramatically during the summer months (typically June, July, August, and September), with August frequently recording the highest number of pilgrims across most years.

  2. Long-term Trend: The overall upward slope of the blue line (despite excluding the anomalous years 2020–2021) suggests that the baseline number of pilgrims has been steadily increasing since 2004, representing a continued growth in the popularity of the route.

  3. 2025 Forecast: The red dashed line projects that this growth trend will continue, with peak pilgrimage months in 2025 expected to exceed the peak numbers observed in the training data from earlier, pre-pandemic years.

Axes Definitions:

  • X-Axis (Time in Months since Jan 2004): Represents the continuous passage of time, used as the primary independent variable to model the long-term trend.
  • Y-Axis (Number of Pilgrims): Represents the monthly count of pilgrims.
In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# === 1. データの読み込みと準備 ===

FILE_PATH = "datasets/finalproject/camino_monthly_2004_2024.csv"

# CSVファイルを指定されたパスから読み込む
try:
    df_raw = pd.read_csv(FILE_PATH)
except FileNotFoundError:
    print(f"致命的なエラー: CSVファイル '{FILE_PATH}' が見つかりませんでした。パスを確認してください。")
    raise

# 不要な列を削除 (ソース [1] に存在する Total および Ratio)
if 'Total' in df_raw.columns:
    df_raw = df_raw.drop(columns=['Total'])
if 'Ratio' in df_raw.columns:
    df_raw = df_raw.drop(columns=['Ratio'])

# Month列が文字列 ('Jan', 'Feb'...) の場合、数値に変換する
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
if df_raw['Month'].dtype == object:
    month_mapping = {m: i+1 for i, m in enumerate(month_names)}
    df_raw['Month'] = df_raw['Month'].map(month_mapping)

# データを長い形式に変換し、'Year'列と'Month'列を確実に整数型に変換
df_long = df_raw.set_index('Month').stack().reset_index()
df_long.columns = ['Month', 'Year_str', 'Pilgrims']
df_long['Year'] = df_long['Year_str'].astype(int)
df_long['Month'] = df_long['Month'].astype(int) 

# パンデミック期間のデータ(2020年〜2021年)を除外
df_train = df_long[
    (df_long['Year'] < 2020) | (df_long['Year'] > 2021)
].copy()

# 極端に低い値を除外
df_train = df_train[df_train['Pilgrims'] > 100]


# === 2. 特徴量エンジニアリング ===
df_train['Time'] = (df_train['Year'] - 2004) * 12 + df_train['Month']
df_train['sin_month'] = np.sin(2 * np.pi * df_train['Month'] / 12)
df_train['cos_month'] = np.cos(2 * np.pi * df_train['Month'] / 12)

X = df_train[['Time', 'sin_month', 'cos_month']]
y = df_train['Pilgrims']

# === 3. モデルの適合 (Fitting) ===
model = LinearRegression()
model.fit(X, y)
y_pred_train = model.predict(X)
df_train['Predicted_Pilgrims'] = y_pred_train 

# === 4. 2025年の予測 ===
months_2025 = np.arange(1, 13)
df_2025 = pd.DataFrame({'Month': months_2025.astype(int), 'Year': 2025}) 
df_2025['Time'] = (df_2025['Year'] - 2004) * 12 + df_2025['Month']
df_2025['sin_month'] = np.sin(2 * np.pi * df_2025['Month'] / 12)
df_2025['cos_month'] = np.cos(2 * np.pi * df_2025['Month'] / 12)

X_2025 = df_2025[['Time', 'sin_month', 'cos_month']]
predictions_2025 = model.predict(X_2025)
df_2025['Predicted_Pilgrims'] = np.maximum(0, predictions_2025).round().astype(int)

# === 5. 結果の表示と可視化 (X軸と月ごとの色分けを改善) ===

total_2025 = df_2025['Predicted_Pilgrims'].sum()
print("\n--- 2025年 巡礼者数 予測結果 (月別) ---")
print(df_2025[['Month', 'Predicted_Pilgrims']])
print(f"\n2025年 合計予測巡礼者数: {total_2025}")

plt.figure(figsize=(16, 8)) 

# カラーマップの定義 
cmap = plt.get_cmap('tab20')
month_colors = {i + 1: cmap(i / 12) for i in range(12)}
# month_names はステップ1で定義済み

# 1. 訓練データの実績値と適合結果を月ごとにプロット
for month_num_float in df_train['Month'].unique():
    month_num = int(month_num_float) 
    
    month_data = df_train[df_train['Month'] == month_num]
    month_name = month_names[month_num - 1]
    color = month_colors[month_num]

    # 訓練データの実績値をプロット (散布図)
    plt.scatter(
        month_data['Time'],
        month_data['Pilgrims'],
        color=color,
        alpha=0.6,
        s=40,
        label=f'Observed: {month_name}' if month_num == 1 else "" 
    )

    # 適合結果を月ごとに線でプロット (折れ線)
    month_data_sorted = month_data.sort_values(by='Time')
    plt.plot(
        month_data_sorted['Time'],
        month_data_sorted['Predicted_Pilgrims'],
        color=color,
        linestyle='-',
        linewidth=1.5,
        alpha=0.8,
        label=f'Fitted: {month_name}'
    )

# 2. 2025年の予測を該当する月の色でプロット (散布図と線)

# 2a. 2025年の予測点を月ごとの色でプロット (マーカー)
for index, row in df_2025.iterrows():
    month_num = int(row['Month'])
    color = month_colors[month_num]
    month_name = month_names[month_num - 1]

    plt.scatter(
        row['Time'],
        row['Predicted_Pilgrims'],
        color=color,
        marker='.', # ドットマーカーを使用
        s=150,
        linewidth=1.5,
        zorder=10,
        label=f'2025 Forecast: {month_name}' if month_num == 1 else "" 
    )

# 2b. 2025年の予測点を赤い点線でつなぐ (折れ線グラフ)
# 予測の時系列データを抽出
time_2025 = df_2025['Time']
pred_2025 = df_2025['Predicted_Pilgrims']

plt.plot(
    time_2025,
    pred_2025,
    color='blue',
    linestyle=':', # 点線スタイル
    linewidth=2,
    zorder=9, # マーカーの下に表示
    label='2025 Forecast Trend Line'
)


# --- X軸の目盛りの設定 ---
years_to_label = np.arange(2004, 2026, 1)
tick_positions = (years_to_label - 2004) * 12 + 1
tick_labels = [str(y) for y in years_to_label]
plt.xticks(tick_positions, tick_labels, rotation=45, ha='right')

# --- ラベルと凡例 ---
plt.title("Pilgrims Trend and Seasonal Prediction (Color-Coded by Month)", fontsize=18)
plt.xlabel("Year (Marked at January of Each Year)", fontsize=14)
plt.ylabel("Number of Pilgrims", fontsize=14)

# 凡例を外側に配置して見やすくする
plt.legend(
    title="Legend",
    bbox_to_anchor=(1.05, 1),
    loc="upper left",
    fontsize=10,
    ncol=2
)
plt.grid(True, alpha=0.3)
plt.tight_layout(rect=[0, 0, 0.85, 1])
plt.show()
--- 2025年 巡礼者数 予測結果 (月別) ---
    Month  Predicted_Pilgrims
0       1               15403
1       2               17222
2       3               24482
3       4               35271
4       5               46730
5       6               55822
6       7               60142
7       8               58567
8       9               51551
9      10               41007
10     11               29792
11     12               20945

2025年 合計予測巡礼者数: 456934
No description has been provided for this image

Using a linear regression model on historical pilgrim data to extract the trend while accounting for seasonality, and forecasting the monthly number of pilgrims for 2025.

(1) Learning seasonality It learns the annual recurring pattern in which the number of pilgrims is high in summer–autumn and low in winter.

(2) Learning the long-term trend It captures the long-term upward trend where the number of pilgrims increases as the years progress.

(3) Excluding the pandemic period (2020–2021) These years are treated as anomalies and removed because they would interfere with the model’s training.

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from numpy.polynomial.polynomial import polyfit, polyval # 多項式回帰用に追加

# ----------------------------------------------------------------------
# STEP 1: データの読み込み、モデル構築、2025年予測値の算出
# ----------------------------------------------------------------------

FILE_PATH = "datasets/finalproject/camino_monthly_2004_2024.csv"
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_mapping = {m: i+1 for i, m in enumerate(month_names)}

# CSVファイルを指定されたパスから読み込む
df_raw = pd.read_csv(FILE_PATH)

# 不要な列を削除 
if 'Total' in df_raw.columns:
    df_raw = df_raw.drop(columns=['Total'])
if 'Ratio' in df_raw.columns:
    df_raw = df_raw.drop(columns=['Ratio'])

# Month列が文字列の場合、数値に変換
if df_raw['Month'].dtype == object:
    df_raw['Month'] = df_raw['Month'].map(month_mapping)

# データを長い形式に変換し、'Year'列と'Month'列を確実に整数型に変換
df_long = df_raw.set_index('Month').stack().reset_index()
df_long.columns = ['Month', 'Year_str', 'Pilgrims']
df_long['Year'] = df_long['Year_str'].astype(int)
df_long['Month'] = df_long['Month'].astype(int) 

# 学習データの準備 (2020年, 2021年, 低い値を除外)
df_train = df_long[(df_long['Year'] < 2020) | (df_long['Year'] > 2021)].copy()
df_train = df_train[df_train['Pilgrims'] > 100]
df_train['Time'] = (df_train['Year'] - 2004) * 12 + df_train['Month']
df_train['sin_month'] = np.sin(2 * np.pi * df_train['Month'] / 12)
df_train['cos_month'] = np.cos(2 * np.pi * df_train['Month'] / 12)

# モデルの適合
model = LinearRegression()
model.fit(df_train[['Time', 'sin_month', 'cos_month']], df_train['Pilgrims'])

# 2025年予測データの生成
months_2025 = np.arange(1, 13)
df_2025_pred = pd.DataFrame({'Month': months_2025.astype(int), 'Year': 2025}) 
df_2025_pred['Time'] = (df_2025_pred['Year'] - 2004) * 12 + df_2025_pred['Month']
df_2025_pred['sin_month'] = np.sin(2 * np.pi * df_2025_pred['Month'] / 12)
df_2025_pred['cos_month'] = np.cos(2 * np.pi * df_2025_pred['Month'] / 12)
predictions_2025 = model.predict(df_2025_pred[['Time', 'sin_month', 'cos_month']])
df_2025_pred['Predicted_Pilgrims'] = np.maximum(0, predictions_2025).round().astype(int)

# ----------------------------------------------------------------------
# STEP 2: 実績データと予測データの統合
# ----------------------------------------------------------------------

# 実際の2025年実績データ
data_2025_actual = {
    "Jan": 2781, "Feb": 3410, "Mar": 13437, "Apr": 52399, "May": 74556, "Jun": 69529,
    "Jul": 63856, "Aug": 73121, "Sep": 76985, "Oct": 57349, "Nov": 10022, "Dec": 926
}

# 実績データをDataFrameに変換
df_actual = pd.DataFrame(list(data_2025_actual.items()), columns=['Month_Name', 'Actual_Pilgrims'])
df_actual['Month'] = df_actual['Month_Name'].map(month_mapping)

# 予測データと実績データを結合
df_comparison = pd.merge(
    df_2025_pred[['Month', 'Predicted_Pilgrims']],
    df_actual[['Month', 'Actual_Pilgrims', 'Month_Name']],
    on='Month',
    how='inner'
)
df_comparison = df_comparison.sort_values(by='Month')
df_comparison['Error'] = df_comparison['Actual_Pilgrims'] - df_comparison['Predicted_Pilgrims']
df_comparison['Percentage_Error'] = (df_comparison['Error'] / df_comparison['Actual_Pilgrims']) * 100


# ----------------------------------------------------------------------
# STEP 3: 予測値 vs 多項式適合線付き実績値の比較グラフの作成
# ----------------------------------------------------------------------

# --- 多項式回帰の実行 ---
# X軸として月番号 (1~12) を使用
X_poly = df_comparison['Month'].values
Y_actual = df_comparison['Actual_Pilgrims'].values

# 3次多項式を適合させる (次数は任意に変更可能)
degree = 3 
# polyfitは係数を降順(x^n, x^(n-1), ... x^0)で返します
coefficients = polyfit(X_poly, Y_actual, deg=degree)

# 適合された多項式からプロット用のY値を生成
Y_poly_fit = polyval(X_poly, coefficients)


plt.figure(figsize=(10, 6))

# 1. 予測値をプロット (線と点)
plt.plot(
    df_comparison['Month_Name'],
    df_comparison['Predicted_Pilgrims'],
    marker='o',
    linestyle='--',
    color='blue',
    label='Predicted 2025 (Model Fit)',
    linewidth=2
)

# 2. 実績値の点をプロット (マーカーはそのまま)
plt.scatter(
    df_comparison['Month_Name'],
    df_comparison['Actual_Pilgrims'],
    marker='s',
    color='red',
    label='Actual 2025 Data Points',
    #s=80, # マーカーサイズを大きく
    zorder=10 # 最前面に表示
)

# 3. 多項式回帰による実績値の適合線をプロット (線で接続)
plt.plot(
    df_comparison['Month_Name'],
    Y_poly_fit,
    linestyle='-',
    color='red',
    label=f'Actual 2025 Trend (Degree {degree} Polynomial Fit)',
    linewidth=2
)

plt.title(f'2025 Pilgrim Counts: Prediction vs. Actual Trend (Degree {degree} Fit)', fontsize=14)
plt.xlabel('Month', fontsize=12)
plt.ylabel('Number of Pilgrims', fontsize=12)
plt.grid(axis='y', alpha=0.5)
plt.legend()
plt.tight_layout()
plt.show()


# 比較表の表示 (Printing the comparison table and summary)

# 英語表記に変更
print("\n--- 2025 Prediction vs. Actual Detailed Comparison ---")
print(df_comparison[['Month_Name', 'Predicted_Pilgrims', 'Actual_Pilgrims', 'Error', 'Percentage_Error']].to_string(float_format="%.1f"))

total_pred = df_comparison['Predicted_Pilgrims'].sum()
total_actual = df_comparison['Actual_Pilgrims'].sum()

# 英語表記に変更
print(f"\nAnnual Total Predicted: {total_pred:,}")
print(f"Annual Total Actual: {total_actual:,}")
print(f"Total Error Percentage: {((total_actual - total_pred) / total_actual) * 100:.2f}%")
No description has been provided for this image
--- 2025 Prediction vs. Actual Detailed Comparison ---
   Month_Name  Predicted_Pilgrims  Actual_Pilgrims  Error  Percentage_Error
0         Jan               15403             2781 -12622            -453.9
1         Feb               17222             3410 -13812            -405.0
2         Mar               24482            13437 -11045             -82.2
3         Apr               35271            52399  17128              32.7
4         May               46730            74556  27826              37.3
5         Jun               55822            69529  13707              19.7
6         Jul               60142            63856   3714               5.8
7         Aug               58567            73121  14554              19.9
8         Sep               51551            76985  25434              33.0
9         Oct               41007            57349  16342              28.5
10        Nov               29792            10022 -19770            -197.3
11        Dec               20945              926 -20019           -2161.9

Annual Total Predicted: 456,934
Annual Total Actual: 498,371
Total Error Percentage: 8.31%
In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from numpy.polynomial.polynomial import polyval

# --- 1. 定数とデータの準備 (前回のコードから再利用) ---

FILE_PATH = "datasets/finalproject/camino_monthly_2004_2024.csv"
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_mapping = {m: i+1 for i, m in enumerate(month_names)}

# CSVファイルの読み込みと前処理
df_raw = pd.read_csv(FILE_PATH)
if 'Total' in df_raw.columns: df_raw = df_raw.drop(columns=['Total'])
if 'Ratio' in df_raw.columns: df_raw = df_raw.drop(columns=['Ratio'])
if df_raw['Month'].dtype == object: df_raw['Month'] = df_raw['Month'].map(month_mapping)

df_long = df_raw.set_index('Month').stack().reset_index()
df_long.columns = ['Month', 'Year_str', 'Pilgrims']
df_long['Year'] = df_long['Year_str'].astype(int)
df_long['Month'] = df_long['Month'].astype(int)

# 学習データの準備 (2020年, 2021年, 低い値を除外)
df_train = df_long[(df_long['Year'] < 2020) | (df_long['Year'] > 2021)].copy()
df_train = df_train[df_train['Pilgrims'] > 100]

# --- 2. 特徴量エンジニアリング (Time^2 を追加) ---

df_train['Time'] = (df_train['Year'] - 2004) * 12 + df_train['Month']
# *** ここで Time^2 特徴量を追加 ***
df_train['Time_sq'] = df_train['Time'] ** 2
df_train['sin_month'] = np.sin(2 * np.pi * df_train['Month'] / 12)
df_train['cos_month'] = np.cos(2 * np.pi * df_train['Month'] / 12)

# 特徴量リストを更新
FEATURES = ['Time', 'Time_sq', 'sin_month', 'cos_month']

X_train = df_train[FEATURES]
y_train = df_train['Pilgrims']

# --- 3. モデルの適合 ---
model_quad = LinearRegression()
model_quad.fit(X_train, y_train)

# 訓練データへの適合結果
df_train['Predicted_Pilgrims'] = model_quad.predict(X_train)


# --- 4. 2025年予測データの生成と予測 ---
months_2025 = np.arange(1, 13)
df_2025_pred = pd.DataFrame({'Month': months_2025.astype(int), 'Year': 2025})
df_2025_pred['Time'] = (df_2025_pred['Year'] - 2004) * 12 + df_2025_pred['Month']
# *** 2025年予測データにも Time^2 特徴量を追加 ***
df_2025_pred['Time_sq'] = df_2025_pred['Time'] ** 2
df_2025_pred['sin_month'] = np.sin(2 * np.pi * df_2025_pred['Month'] / 12)
df_2025_pred['cos_month'] = np.cos(2 * np.pi * df_2025_pred['Month'] / 12)

X_2025 = df_2025_pred[FEATURES]
predictions_2025 = model_quad.predict(X_2025)
df_2025_pred['Predicted_Pilgrims'] = np.maximum(0, predictions_2025).round().astype(int)


# --- 5. 比較分析 (前回の実績値を使用) ---
data_2025_actual = {
    "Jan": 2781, "Feb": 3410, "Mar": 13437, "Apr": 52399, "May": 74556, "Jun": 69529,
    "Jul": 63856, "Aug": 73121, "Sep": 76985, "Oct": 57349, "Nov": 10022, "Dec": 926
}

df_actual = pd.DataFrame(list(data_2025_actual.items()), columns=['Month_Name', 'Actual_Pilgrims'])
df_actual['Month'] = df_actual['Month_Name'].map(month_mapping)

df_comparison = pd.merge(
    df_2025_pred[['Month', 'Predicted_Pilgrims']],
    df_actual[['Month', 'Actual_Pilgrims', 'Month_Name']],
    on='Month',
    how='inner'
).sort_values(by='Month')

df_comparison['Error'] = df_comparison['Actual_Pilgrims'] - df_comparison['Predicted_Pilgrims']
df_comparison['Percentage_Error'] = (df_comparison['Error'] / df_comparison['Actual_Pilgrims']) * 100


# --- 6. 結果の表示と可視化 ---

# 係数の表示
print("--- モデル係数 (Quadratic Trend Model) ---")
print(f"Intercept (切片): {model_quad.intercept_:.2f}")
for feature, coef in zip(FEATURES, model_quad.coef_):
    print(f"{feature}: {coef:.4f}")


print("\n--- 2025 Prediction vs. Actual Detailed Comparison (Quadratic Trend) ---")
print(df_comparison[['Month_Name', 'Predicted_Pilgrims', 'Actual_Pilgrims', 'Error', 'Percentage_Error']].to_string(float_format="%.1f"))

total_pred = df_comparison['Predicted_Pilgrims'].sum()
total_actual = df_comparison['Actual_Pilgrims'].sum()

print(f"\nAnnual Total Predicted (Quadratic): {total_pred:,}")
print(f"Annual Total Actual: {total_actual:,}")
print(f"Total Error Percentage: {((total_actual - total_pred) / total_actual) * 100:.2f}%")


# 可視化
plt.figure(figsize=(10, 6))

# 1. 予測値をプロット (線と点)
plt.plot(
    df_comparison['Month_Name'],
    df_comparison['Predicted_Pilgrims'],
    marker='o',
    linestyle='--',
    color='blue',
    label='2025 Forecast (Quadratic Model Fit)',
    linewidth=2
)

# 2. 実績値の点をプロット (マーカー)
plt.scatter(
    df_comparison['Month_Name'],
    df_comparison['Actual_Pilgrims'],
    marker='s',
    color='red',
    label='2025 Actual Data Points',
    zorder=10
)

# 3. 実績値の多項式適合線 (前回の3次を使用)
X_poly = df_comparison['Month'].values
Y_actual = df_comparison['Actual_Pilgrims'].values
degree = 3
coefficients = np.polyfit(X_poly, Y_actual, deg=degree)
Y_poly_fit = np.polyval(coefficients, X_poly)

plt.plot(
    df_comparison['Month_Name'],
    Y_poly_fit,
    linestyle='-',
    color='red',
    alpha=0.6,
    label=f'2025 Actual Trend (Degree {degree} Polynomial Fit)',
    linewidth=2
)

plt.title(f'2025 Pilgrim Counts: Prediction vs. Actual (Quadratic Trend)', fontsize=14)
plt.xlabel('Month', fontsize=12)
plt.ylabel('Number of Pilgrims', fontsize=12)
plt.grid(axis='y', alpha=0.5)
plt.legend()
plt.tight_layout()
plt.show()
--- モデル係数 (Quadratic Trend Model) ---
Intercept (切片): 9366.56
Time: 47.8790
Time_sq: 0.2959
sin_month: -13154.0899
cos_month: -17838.4006

--- 2025 Prediction vs. Actual Detailed Comparison (Quadratic Trend) ---
   Month_Name  Predicted_Pilgrims  Actual_Pilgrims  Error  Percentage_Error
0         Jan               18397             2781 -15616            -561.5
1         Feb               20309             3410 -16899            -495.6
2         Mar               27665            13437 -14228            -105.9
3         Apr               38545            52399  13854              26.4
4         May               50089            74556  24467              32.8
5         Jun               59256            69529  10273              14.8
6         Jul               63644            63856    212               0.3
7         Aug               62131            73121  10990              15.0
8         Sep               55176            76985  21809              28.3
9         Oct               44697            57349  12652              22.1
10        Nov               33557            10022 -23535            -234.8
11        Dec               24794              926 -23868           -2577.5

Annual Total Predicted (Quadratic): 498,260
Annual Total Actual: 498,371
Total Error Percentage: 0.02%
No description has been provided for this image
In [ ]: