import pandas as pd

df = pd.read_csv("datasets/finalproject/camino_monthly_2004_2024.csv")
df.head()

import pandas as pd
import matplotlib.pyplot as plt

# === CSV 読み込み ===
df = pd.read_csv("datasets/finalproject/camino_monthly_2004_2024.csv")

months = df["Month"]

# 対象とする年
years = [
    "2004","2005","2006","2007","2008","2009","2010",
    "2011","2012","2016","2017","2018","2019",
    "2020","2021","2022","2023","2024"
]

# === カラーマップ ===
cmap = plt.get_cmap("tab20")   # 20色の視認性が高い色パレット

plt.figure(figsize=(16, 8))

for i, year in enumerate(years):
    color = cmap(i % 20)  # 20色をループ利用
    plt.plot(
        months,
        df[year],
        label=year,
        color=color,
        linewidth=2.0,
        marker="o",
        markersize=4
    )

# === タイトルやラベル ===
plt.title("Monthly Pilgrims by Year (2004–2024)", fontsize=16)
plt.xlabel("Month", fontsize=14)
plt.ylabel("Number of Pilgrims", fontsize=14)
plt.grid(True, alpha=0.3)

# === 凡例（外側へ） ===
plt.legend(
    title="Year",
    bbox_to_anchor=(1.05, 1),
    loc="upper left",
    fontsize=10
)

plt.tight_layout()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# === CSV 読み込み ===
df = pd.read_csv("datasets/finalproject/camino_monthly_2004_2024.csv")

# 月を 1〜12 の数値に変換
months_num = np.arange(1, 13)

# フィッティングしたい年
year = "2024"
y = df[year]

# === 多項式フィッティング (3次) ===
coeff = np.polyfit(months_num, y, 3)   # 3次式
poly = np.poly1d(coeff)

# フィッティング曲線用の細かい x
x_fit = np.linspace(1, 12, 200)
y_fit = poly(x_fit)

# === プロット ===
plt.figure(figsize=(10, 6))

# 点のみ
plt.scatter(months_num, y, color="blue", label=f"{year} data")

# フィッティング曲線
plt.plot(x_fit, y_fit, color="red", linewidth=2, label="Fitted curve (3rd degree)")

plt.title(f"Fitting Example for {year}")
plt.xlabel("Month")
plt.ylabel("Pilgrims")
plt.grid(True)
plt.legend()

plt.show()

# === 近似式表示 ===
print("近似多項式（3次）:")
print(poly)

近似多項式（3次）:
        3        2
-448.7 x + 7080 x - 2.286e+04 x + 2.14e+04

import numpy as np
import matplotlib.pyplot as plt

# ========================
# 1. 手入力データ（2025年）
# ========================
data_2025 = {
    "Jan": 2781,
    "Feb": 3410,
    "Mar": 13437,
    "Apr": 52399,
    "May": 74556,
    "Jun": 69529,
    "Jul": 63856,
    "Aug": 73121,
    "Sep": 76985,
    "Oct": 57349,
    "Nov": 10022,
    "Dec": 926
}

# 月の順番（1〜12）
months_str = list(data_2025.keys())
months_num = np.arange(1, 13)     # 1~12
values = np.array(list(data_2025.values()))

# ========================
# 2. 多項式フィッティング（3次）
# ========================
coeff = np.polyfit(months_num, values, 3)
poly = np.poly1d(coeff)

# フィッティング線用の細かい x
x_fit = np.linspace(1, 12, 300)
y_fit = poly(x_fit)

# ========================
# 3. プロット
# ========================
plt.figure(figsize=(10, 6))

# 点のみ
plt.scatter(months_num, values, label="2025 data")

# フィッティング曲線
plt.plot(x_fit, y_fit, linewidth=2, label="Fitted curve (3rd degree)")

plt.title("Pilgrims per Month (2025)")
plt.xlabel("Month")
plt.ylabel("Pilgrims")

# x 軸を Jan〜Dec に
plt.xticks(months_num, months_str)

plt.grid(True)
plt.legend()

plt.show()

# ========================
# 4. 近似式の表示
# ========================
print("近似多項式（3次）:")
print(poly)

近似多項式（3次）:
        3        2
-200.8 x + 1284 x + 1.464e+04 x - 2.135e+04

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# =========================
# 1. CSV 読み込み（2004〜2024）
# =========================
df = pd.read_csv("datasets/finalproject/camino_monthly_2004_2024.csv")

months_num = np.arange(1, 13)
months_str = ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"]

# --- 2024 年データ ---
y_2024 = df["2024"]

# --- 2024 のフィッティング ---
coeff_2024 = np.polyfit(months_num, y_2024, 3)
poly_2024 = np.poly1d(coeff_2024)
x_fit = np.linspace(1, 12, 300)
y_2024_fit = poly_2024(x_fit)

# =========================
# 2. 手入力データ（2025年）
# =========================
data_2025 = {
    "Jan": 2781,
    "Feb": 3410,
    "Mar": 13437,
    "Apr": 52399,
    "May": 74556,
    "Jun": 69529,
    "Jul": 63856,
    "Aug": 73121,
    "Sep": 76985,
    "Oct": 57349,
    "Nov": 10022,
    "Dec": 926
}
values_2025 = np.array(list(data_2025.values()))

# --- 2025 のフィッティング ---
coeff_2025 = np.polyfit(months_num, values_2025, 3)
poly_2025 = np.poly1d(coeff_2025)
y_2025_fit = poly_2025(x_fit)

# =========================
# 3. プロット（2024 ＋ 2025）
# =========================
plt.figure(figsize=(12, 7))

# --- 2024 ---
plt.scatter(months_num, y_2024, color="blue", label="2024 data")
plt.plot(x_fit, y_2024_fit, color="blue", linestyle="--", linewidth=2, label="2024 fitted")

# --- 2025 ---
plt.scatter(months_num, values_2025, color="red", label="2025 data")
plt.plot(x_fit, y_2025_fit, color="red", linestyle="-", linewidth=2, label="2025 fitted")

plt.title("Pilgrims per Month: 2024 vs 2025")
plt.xlabel("Month")
plt.ylabel("Pilgrims")
plt.xticks(months_num, months_str)
plt.grid(True)
plt.legend()
plt.show()

# =========================
# 4. 近似式表示
# =========================
print("=== 2024 cubic approximation polynomial ===")
print(poly_2024)
print("\n=== 2025 cubic approximation polynomial ===")
print(poly_2025)

=== 2024 cubic approximation polynomial ===
        3        2
-448.7 x + 7080 x - 2.286e+04 x + 2.14e+04

=== 2025 cubic approximation polynomial ===
        3        2
-200.8 x + 1284 x + 1.464e+04 x - 2.135e+04

### 2025年巡礼者数予測のためのPythonコード（CSV読み込み対応版）
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# === 1. データの読み込みと準備 ===

FILE_PATH = "datasets/finalproject/camino_monthly_2004_2024.csv"

# CSVファイルを指定されたパスから読み込む
df_raw = pd.read_csv(FILE_PATH)

# 不要な列 ('Total', 'Ratio') があれば削除する (ソース [1] に存在する)
if 'Total' in df_raw.columns:
    df_raw = df_raw.drop(columns=['Total'])
if 'Ratio' in df_raw.columns:
    df_raw = df_raw.drop(columns=['Ratio'])

# Month列が文字列 ('Jan', 'Feb'...) の場合、数値に変換する
if df_raw['Month'].dtype == object:
    month_mapping = {m: i+1 for i, m in enumerate(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])}
    df_raw['Month'] = df_raw['Month'].map(month_mapping)


# データを長い形式（tidy format）に変換
df_long = df_raw.set_index('Month').stack().reset_index()
df_long.columns = ['Month', 'Year_str', 'Pilgrims']
df_long['Year'] = df_long['Year_str'].astype(int)


# 安定した予測モデル構築のため、パンデミック期間のデータ（2020年〜2021年）を除外
df_train = df_long[
    (df_long['Year'] < 2020) | (df_long['Year'] > 2021)
].copy()

# 極端に低い値（ノイズや一時的な閉鎖期間のデータ）も学習データから除く
df_train = df_train[df_train['Pilgrims'] > 100]


# === 2. 特徴量エンジニアリング ===
# 連続的な時間変数（トレンド）の作成: 2004年1月から始まる月数
df_train['Time'] = (df_train['Year'] - 2004) * 12 + df_train['Month']

# 季節性（サイクル）を捉えるための三角関数特徴量の作成 (周期は12ヶ月)
df_train['sin_month'] = np.sin(2 * np.pi * df_train['Month'] / 12)
df_train['cos_month'] = np.cos(2 * np.pi * df_train['Month'] / 12)

# 特徴量行列 X と目的変数 Y の定義
X = df_train[['Time', 'sin_month', 'cos_month']]
y = df_train['Pilgrims']

# === 3. モデルの適合 (Fitting) ===
# 線形最小二乗法モデル (Linear Least Squares) の初期化と学習
model = LinearRegression()
model.fit(X, y)

# 適合結果の評価 (RMSE)
y_pred_train = model.predict(X)
rmse = np.sqrt(mean_squared_error(y, y_pred_train))
print(f"訓練データのRMSE (巡礼者数): {rmse:.2f}")

# === 4. 2025年の予測 ===
# 2025年の予測データフレームを作成
months_2025 = np.arange(1, 13)
df_2025 = pd.DataFrame({'Month': months_2025, 'Year': 2025})

# 2025年のTime変数: 2004年1月を基点として計算
df_2025['Time'] = (df_2025['Year'] - 2004) * 12 + df_2025['Month']

# 2025年の季節性特徴量を計算
df_2025['sin_month'] = np.sin(2 * np.pi * df_2025['Month'] / 12)
df_2025['cos_month'] = np.cos(2 * np.pi * df_2025['Month'] / 12)

# 2025年の特徴量行列
X_2025 = df_2025[['Time', 'sin_month', 'cos_month']]

# 予測の実行
predictions_2025 = model.predict(X_2025)

# 予測結果をデータフレームに追加し、負の予測値をゼロに修正
df_2025['Predicted_Pilgrims'] = np.maximum(0, predictions_2025).round().astype(int)


# === 5. 結果の表示と可視化 (X軸改善) ===

# 2025年予測の合計
total_2025 = df_2025['Predicted_Pilgrims'].sum()

print("\n--- 2025年 巡礼者数 予測結果 (月別) ---")
print(df_2025[['Month', 'Predicted_Pilgrims']])
print(f"\n2025年 合計予測巡礼者数: {total_2025}")

# 可視化：訓練データと予測の比較
plt.figure(figsize=(16, 8)) # グラフを大きくし、ラベルが見やすくなるように調整

# 訓練データの実績値をプロット（黒点）
plt.scatter(
    df_train['Time'],
    df_train['Pilgrims'],
    label='Observed Data (Cleaned Training Set)',
    color='black',
    alpha=0.5
)

# 訓練データに対するモデルの適合結果をプロット（青線）
plt.plot(
    df_train['Time'],
    y_pred_train,
    label='Fitted Model (Training)',
    color='blue'
)

# 2025年の予測をプロット（赤線）
plt.plot(
    df_2025['Time'],
    df_2025['Predicted_Pilgrims'],
    label='2025 Prediction',
    color='red',
    linestyle='--',
    marker='o',
    markersize=5
)

# --- X軸の目盛りの改善 ---

# 1. 表示したい年のリストを作成（データ範囲 2004年から2025年まで）
# 注: 資料のデータは2013-2015が欠落しているが、時間軸は連続しているため、連続的に設定する
years_to_label = np.arange(2004, 2026, 1) # 2004年から2025年まで毎年表示

# 2. 各年の1月（Month=1）に対応する連続的なTime値を計算
# Time = (Year - 2004) * 12 + Month
tick_positions = (years_to_label - 2004) * 12 + 1
tick_labels = [str(y) for y in years_to_label]

# 3. グラフに目盛りとラベルを適用
plt.xticks(tick_positions, tick_labels, rotation=45, ha='right')

# 4. タイトルとラベルを更新
plt.title("Pilgrims Trend and Seasonal Prediction (Annual Milestones)", fontsize=18)
plt.xlabel("Year (Marked at January of Each Year)", fontsize=14)
plt.ylabel("Number of Pilgrims", fontsize=14)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

訓練データのRMSE (巡礼者数): 9126.08

--- 2025年 巡礼者数 予測結果 (月別) ---
    Month  Predicted_Pilgrims
0       1               15403
1       2               17222
2       3               24482
3       4               35271
4       5               46730
5       6               55822
6       7               60142
7       8               58567
8       9               51551
9      10               41007
10     11               29792
11     12               20945

2025年 合計予測巡礼者数: 456934

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# === 1. データの読み込みと準備 ===

FILE_PATH = "datasets/finalproject/camino_monthly_2004_2024.csv"

# CSVファイルを指定されたパスから読み込む
try:
    df_raw = pd.read_csv(FILE_PATH)
except FileNotFoundError:
    print(f"致命的なエラー: CSVファイル '{FILE_PATH}' が見つかりませんでした。パスを確認してください。")
    raise

# 不要な列を削除 (ソース [1] に存在する Total および Ratio)
if 'Total' in df_raw.columns:
    df_raw = df_raw.drop(columns=['Total'])
if 'Ratio' in df_raw.columns:
    df_raw = df_raw.drop(columns=['Ratio'])

# Month列が文字列 ('Jan', 'Feb'...) の場合、数値に変換する
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
if df_raw['Month'].dtype == object:
    month_mapping = {m: i+1 for i, m in enumerate(month_names)}
    df_raw['Month'] = df_raw['Month'].map(month_mapping)

# データを長い形式に変換し、'Year'列と'Month'列を確実に整数型に変換
df_long = df_raw.set_index('Month').stack().reset_index()
df_long.columns = ['Month', 'Year_str', 'Pilgrims']
df_long['Year'] = df_long['Year_str'].astype(int)
df_long['Month'] = df_long['Month'].astype(int) 

# パンデミック期間のデータ（2020年〜2021年）を除外
df_train = df_long[
    (df_long['Year'] < 2020) | (df_long['Year'] > 2021)
].copy()

# 極端に低い値を除外
df_train = df_train[df_train['Pilgrims'] > 100]


# === 2. 特徴量エンジニアリング ===
df_train['Time'] = (df_train['Year'] - 2004) * 12 + df_train['Month']
df_train['sin_month'] = np.sin(2 * np.pi * df_train['Month'] / 12)
df_train['cos_month'] = np.cos(2 * np.pi * df_train['Month'] / 12)

X = df_train[['Time', 'sin_month', 'cos_month']]
y = df_train['Pilgrims']

# === 3. モデルの適合 (Fitting) ===
model = LinearRegression()
model.fit(X, y)
y_pred_train = model.predict(X)
df_train['Predicted_Pilgrims'] = y_pred_train 

# === 4. 2025年の予測 ===
months_2025 = np.arange(1, 13)
df_2025 = pd.DataFrame({'Month': months_2025.astype(int), 'Year': 2025}) 
df_2025['Time'] = (df_2025['Year'] - 2004) * 12 + df_2025['Month']
df_2025['sin_month'] = np.sin(2 * np.pi * df_2025['Month'] / 12)
df_2025['cos_month'] = np.cos(2 * np.pi * df_2025['Month'] / 12)

X_2025 = df_2025[['Time', 'sin_month', 'cos_month']]
predictions_2025 = model.predict(X_2025)
df_2025['Predicted_Pilgrims'] = np.maximum(0, predictions_2025).round().astype(int)

# === 5. 結果の表示と可視化 (X軸と月ごとの色分けを改善) ===

total_2025 = df_2025['Predicted_Pilgrims'].sum()
print("\n--- 2025年 巡礼者数 予測結果 (月別) ---")
print(df_2025[['Month', 'Predicted_Pilgrims']])
print(f"\n2025年 合計予測巡礼者数: {total_2025}")

plt.figure(figsize=(16, 8)) 

# カラーマップの定義 
cmap = plt.get_cmap('tab20')
month_colors = {i + 1: cmap(i / 12) for i in range(12)}
# month_names はステップ1で定義済み

# 1. 訓練データの実績値と適合結果を月ごとにプロット
for month_num_float in df_train['Month'].unique():
    month_num = int(month_num_float) 
    
    month_data = df_train[df_train['Month'] == month_num]
    month_name = month_names[month_num - 1]
    color = month_colors[month_num]

    # 訓練データの実績値をプロット (散布図)
    plt.scatter(
        month_data['Time'],
        month_data['Pilgrims'],
        color=color,
        alpha=0.6,
        s=40,
        label=f'Observed: {month_name}' if month_num == 1 else "" 
    )

    # 適合結果を月ごとに線でプロット (折れ線)
    month_data_sorted = month_data.sort_values(by='Time')
    plt.plot(
        month_data_sorted['Time'],
        month_data_sorted['Predicted_Pilgrims'],
        color=color,
        linestyle='-',
        linewidth=1.5,
        alpha=0.8,
        label=f'Fitted: {month_name}'
    )

# 2. 2025年の予測を該当する月の色でプロット (散布図と線)

# 2a. 2025年の予測点を月ごとの色でプロット (マーカー)
for index, row in df_2025.iterrows():
    month_num = int(row['Month'])
    color = month_colors[month_num]
    month_name = month_names[month_num - 1]

    plt.scatter(
        row['Time'],
        row['Predicted_Pilgrims'],
        color=color,
        marker='.', # ドットマーカーを使用
        s=150,
        linewidth=1.5,
        zorder=10,
        label=f'2025 Forecast: {month_name}' if month_num == 1 else "" 
    )

# 2b. 2025年の予測点を赤い点線でつなぐ (折れ線グラフ)
# 予測の時系列データを抽出
time_2025 = df_2025['Time']
pred_2025 = df_2025['Predicted_Pilgrims']

plt.plot(
    time_2025,
    pred_2025,
    color='blue',
    linestyle=':', # 点線スタイル
    linewidth=2,
    zorder=9, # マーカーの下に表示
    label='2025 Forecast Trend Line'
)


# --- X軸の目盛りの設定 ---
years_to_label = np.arange(2004, 2026, 1)
tick_positions = (years_to_label - 2004) * 12 + 1
tick_labels = [str(y) for y in years_to_label]
plt.xticks(tick_positions, tick_labels, rotation=45, ha='right')

# --- ラベルと凡例 ---
plt.title("Pilgrims Trend and Seasonal Prediction (Color-Coded by Month)", fontsize=18)
plt.xlabel("Year (Marked at January of Each Year)", fontsize=14)
plt.ylabel("Number of Pilgrims", fontsize=14)

# 凡例を外側に配置して見やすくする
plt.legend(
    title="Legend",
    bbox_to_anchor=(1.05, 1),
    loc="upper left",
    fontsize=10,
    ncol=2
)
plt.grid(True, alpha=0.3)
plt.tight_layout(rect=[0, 0, 0.85, 1])
plt.show()

--- 2025年 巡礼者数 予測結果 (月別) ---
    Month  Predicted_Pilgrims
0       1               15403
1       2               17222
2       3               24482
3       4               35271
4       5               46730
5       6               55822
6       7               60142
7       8               58567
8       9               51551
9      10               41007
10     11               29792
11     12               20945

2025年 合計予測巡礼者数: 456934

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from numpy.polynomial.polynomial import polyfit, polyval # 多項式回帰用に追加

# ----------------------------------------------------------------------
# STEP 1: データの読み込み、モデル構築、2025年予測値の算出
# ----------------------------------------------------------------------

FILE_PATH = "datasets/finalproject/camino_monthly_2004_2024.csv"
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_mapping = {m: i+1 for i, m in enumerate(month_names)}

# CSVファイルを指定されたパスから読み込む
df_raw = pd.read_csv(FILE_PATH)

# 不要な列を削除 
if 'Total' in df_raw.columns:
    df_raw = df_raw.drop(columns=['Total'])
if 'Ratio' in df_raw.columns:
    df_raw = df_raw.drop(columns=['Ratio'])

# Month列が文字列の場合、数値に変換
if df_raw['Month'].dtype == object:
    df_raw['Month'] = df_raw['Month'].map(month_mapping)

# データを長い形式に変換し、'Year'列と'Month'列を確実に整数型に変換
df_long = df_raw.set_index('Month').stack().reset_index()
df_long.columns = ['Month', 'Year_str', 'Pilgrims']
df_long['Year'] = df_long['Year_str'].astype(int)
df_long['Month'] = df_long['Month'].astype(int) 

# 学習データの準備 (2020年, 2021年, 低い値を除外)
df_train = df_long[(df_long['Year'] < 2020) | (df_long['Year'] > 2021)].copy()
df_train = df_train[df_train['Pilgrims'] > 100]
df_train['Time'] = (df_train['Year'] - 2004) * 12 + df_train['Month']
df_train['sin_month'] = np.sin(2 * np.pi * df_train['Month'] / 12)
df_train['cos_month'] = np.cos(2 * np.pi * df_train['Month'] / 12)

# モデルの適合
model = LinearRegression()
model.fit(df_train[['Time', 'sin_month', 'cos_month']], df_train['Pilgrims'])

# 2025年予測データの生成
months_2025 = np.arange(1, 13)
df_2025_pred = pd.DataFrame({'Month': months_2025.astype(int), 'Year': 2025}) 
df_2025_pred['Time'] = (df_2025_pred['Year'] - 2004) * 12 + df_2025_pred['Month']
df_2025_pred['sin_month'] = np.sin(2 * np.pi * df_2025_pred['Month'] / 12)
df_2025_pred['cos_month'] = np.cos(2 * np.pi * df_2025_pred['Month'] / 12)
predictions_2025 = model.predict(df_2025_pred[['Time', 'sin_month', 'cos_month']])
df_2025_pred['Predicted_Pilgrims'] = np.maximum(0, predictions_2025).round().astype(int)

# ----------------------------------------------------------------------
# STEP 2: 実績データと予測データの統合
# ----------------------------------------------------------------------

# 実際の2025年実績データ
data_2025_actual = {
    "Jan": 2781, "Feb": 3410, "Mar": 13437, "Apr": 52399, "May": 74556, "Jun": 69529,
    "Jul": 63856, "Aug": 73121, "Sep": 76985, "Oct": 57349, "Nov": 10022, "Dec": 926
}

# 実績データをDataFrameに変換
df_actual = pd.DataFrame(list(data_2025_actual.items()), columns=['Month_Name', 'Actual_Pilgrims'])
df_actual['Month'] = df_actual['Month_Name'].map(month_mapping)

# 予測データと実績データを結合
df_comparison = pd.merge(
    df_2025_pred[['Month', 'Predicted_Pilgrims']],
    df_actual[['Month', 'Actual_Pilgrims', 'Month_Name']],
    on='Month',
    how='inner'
)
df_comparison = df_comparison.sort_values(by='Month')
df_comparison['Error'] = df_comparison['Actual_Pilgrims'] - df_comparison['Predicted_Pilgrims']
df_comparison['Percentage_Error'] = (df_comparison['Error'] / df_comparison['Actual_Pilgrims']) * 100


# ----------------------------------------------------------------------
# STEP 3: 予測値 vs 多項式適合線付き実績値の比較グラフの作成
# ----------------------------------------------------------------------

# --- 多項式回帰の実行 ---
# X軸として月番号 (1～12) を使用
X_poly = df_comparison['Month'].values
Y_actual = df_comparison['Actual_Pilgrims'].values

# 3次多項式を適合させる (次数は任意に変更可能)
degree = 3 
# polyfitは係数を降順（x^n, x^(n-1), ... x^0）で返します
coefficients = polyfit(X_poly, Y_actual, deg=degree)

# 適合された多項式からプロット用のY値を生成
Y_poly_fit = polyval(X_poly, coefficients)


plt.figure(figsize=(10, 6))

# 1. 予測値をプロット (線と点)
plt.plot(
    df_comparison['Month_Name'],
    df_comparison['Predicted_Pilgrims'],
    marker='o',
    linestyle='--',
    color='blue',
    label='Predicted 2025 (Model Fit)',
    linewidth=2
)

# 2. 実績値の点をプロット (マーカーはそのまま)
plt.scatter(
    df_comparison['Month_Name'],
    df_comparison['Actual_Pilgrims'],
    marker='s',
    color='red',
    label='Actual 2025 Data Points',
    #s=80, # マーカーサイズを大きく
    zorder=10 # 最前面に表示
)

# 3. 多項式回帰による実績値の適合線をプロット (線で接続)
plt.plot(
    df_comparison['Month_Name'],
    Y_poly_fit,
    linestyle='-',
    color='red',
    label=f'Actual 2025 Trend (Degree {degree} Polynomial Fit)',
    linewidth=2
)

plt.title(f'2025 Pilgrim Counts: Prediction vs. Actual Trend (Degree {degree} Fit)', fontsize=14)
plt.xlabel('Month', fontsize=12)
plt.ylabel('Number of Pilgrims', fontsize=12)
plt.grid(axis='y', alpha=0.5)
plt.legend()
plt.tight_layout()
plt.show()


# 比較表の表示 (Printing the comparison table and summary)

# 英語表記に変更
print("\n--- 2025 Prediction vs. Actual Detailed Comparison ---")
print(df_comparison[['Month_Name', 'Predicted_Pilgrims', 'Actual_Pilgrims', 'Error', 'Percentage_Error']].to_string(float_format="%.1f"))

total_pred = df_comparison['Predicted_Pilgrims'].sum()
total_actual = df_comparison['Actual_Pilgrims'].sum()

# 英語表記に変更
print(f"\nAnnual Total Predicted: {total_pred:,}")
print(f"Annual Total Actual: {total_actual:,}")
print(f"Total Error Percentage: {((total_actual - total_pred) / total_actual) * 100:.2f}%")

--- 2025 Prediction vs. Actual Detailed Comparison ---
   Month_Name  Predicted_Pilgrims  Actual_Pilgrims  Error  Percentage_Error
0         Jan               15403             2781 -12622            -453.9
1         Feb               17222             3410 -13812            -405.0
2         Mar               24482            13437 -11045             -82.2
3         Apr               35271            52399  17128              32.7
4         May               46730            74556  27826              37.3
5         Jun               55822            69529  13707              19.7
6         Jul               60142            63856   3714               5.8
7         Aug               58567            73121  14554              19.9
8         Sep               51551            76985  25434              33.0
9         Oct               41007            57349  16342              28.5
10        Nov               29792            10022 -19770            -197.3
11        Dec               20945              926 -20019           -2161.9

Annual Total Predicted: 456,934
Annual Total Actual: 498,371
Total Error Percentage: 8.31%

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from numpy.polynomial.polynomial import polyval

# --- 1. 定数とデータの準備 (前回のコードから再利用) ---

FILE_PATH = "datasets/finalproject/camino_monthly_2004_2024.csv"
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_mapping = {m: i+1 for i, m in enumerate(month_names)}

# CSVファイルの読み込みと前処理
df_raw = pd.read_csv(FILE_PATH)
if 'Total' in df_raw.columns: df_raw = df_raw.drop(columns=['Total'])
if 'Ratio' in df_raw.columns: df_raw = df_raw.drop(columns=['Ratio'])
if df_raw['Month'].dtype == object: df_raw['Month'] = df_raw['Month'].map(month_mapping)

df_long = df_raw.set_index('Month').stack().reset_index()
df_long.columns = ['Month', 'Year_str', 'Pilgrims']
df_long['Year'] = df_long['Year_str'].astype(int)
df_long['Month'] = df_long['Month'].astype(int)

# 学習データの準備 (2020年, 2021年, 低い値を除外)
df_train = df_long[(df_long['Year'] < 2020) | (df_long['Year'] > 2021)].copy()
df_train = df_train[df_train['Pilgrims'] > 100]

# --- 2. 特徴量エンジニアリング (Time^2 を追加) ---

df_train['Time'] = (df_train['Year'] - 2004) * 12 + df_train['Month']
# *** ここで Time^2 特徴量を追加 ***
df_train['Time_sq'] = df_train['Time'] ** 2
df_train['sin_month'] = np.sin(2 * np.pi * df_train['Month'] / 12)
df_train['cos_month'] = np.cos(2 * np.pi * df_train['Month'] / 12)

# 特徴量リストを更新
FEATURES = ['Time', 'Time_sq', 'sin_month', 'cos_month']

X_train = df_train[FEATURES]
y_train = df_train['Pilgrims']

# --- 3. モデルの適合 ---
model_quad = LinearRegression()
model_quad.fit(X_train, y_train)

# 訓練データへの適合結果
df_train['Predicted_Pilgrims'] = model_quad.predict(X_train)


# --- 4. 2025年予測データの生成と予測 ---
months_2025 = np.arange(1, 13)
df_2025_pred = pd.DataFrame({'Month': months_2025.astype(int), 'Year': 2025})
df_2025_pred['Time'] = (df_2025_pred['Year'] - 2004) * 12 + df_2025_pred['Month']
# *** 2025年予測データにも Time^2 特徴量を追加 ***
df_2025_pred['Time_sq'] = df_2025_pred['Time'] ** 2
df_2025_pred['sin_month'] = np.sin(2 * np.pi * df_2025_pred['Month'] / 12)
df_2025_pred['cos_month'] = np.cos(2 * np.pi * df_2025_pred['Month'] / 12)

X_2025 = df_2025_pred[FEATURES]
predictions_2025 = model_quad.predict(X_2025)
df_2025_pred['Predicted_Pilgrims'] = np.maximum(0, predictions_2025).round().astype(int)


# --- 5. 比較分析 (前回の実績値を使用) ---
data_2025_actual = {
    "Jan": 2781, "Feb": 3410, "Mar": 13437, "Apr": 52399, "May": 74556, "Jun": 69529,
    "Jul": 63856, "Aug": 73121, "Sep": 76985, "Oct": 57349, "Nov": 10022, "Dec": 926
}

df_actual = pd.DataFrame(list(data_2025_actual.items()), columns=['Month_Name', 'Actual_Pilgrims'])
df_actual['Month'] = df_actual['Month_Name'].map(month_mapping)

df_comparison = pd.merge(
    df_2025_pred[['Month', 'Predicted_Pilgrims']],
    df_actual[['Month', 'Actual_Pilgrims', 'Month_Name']],
    on='Month',
    how='inner'
).sort_values(by='Month')

df_comparison['Error'] = df_comparison['Actual_Pilgrims'] - df_comparison['Predicted_Pilgrims']
df_comparison['Percentage_Error'] = (df_comparison['Error'] / df_comparison['Actual_Pilgrims']) * 100


# --- 6. 結果の表示と可視化 ---

# 係数の表示
print("--- モデル係数 (Quadratic Trend Model) ---")
print(f"Intercept (切片): {model_quad.intercept_:.2f}")
for feature, coef in zip(FEATURES, model_quad.coef_):
    print(f"{feature}: {coef:.4f}")


print("\n--- 2025 Prediction vs. Actual Detailed Comparison (Quadratic Trend) ---")
print(df_comparison[['Month_Name', 'Predicted_Pilgrims', 'Actual_Pilgrims', 'Error', 'Percentage_Error']].to_string(float_format="%.1f"))

total_pred = df_comparison['Predicted_Pilgrims'].sum()
total_actual = df_comparison['Actual_Pilgrims'].sum()

print(f"\nAnnual Total Predicted (Quadratic): {total_pred:,}")
print(f"Annual Total Actual: {total_actual:,}")
print(f"Total Error Percentage: {((total_actual - total_pred) / total_actual) * 100:.2f}%")


# 可視化
plt.figure(figsize=(10, 6))

# 1. 予測値をプロット (線と点)
plt.plot(
    df_comparison['Month_Name'],
    df_comparison['Predicted_Pilgrims'],
    marker='o',
    linestyle='--',
    color='blue',
    label='2025 Forecast (Quadratic Model Fit)',
    linewidth=2
)

# 2. 実績値の点をプロット (マーカー)
plt.scatter(
    df_comparison['Month_Name'],
    df_comparison['Actual_Pilgrims'],
    marker='s',
    color='red',
    label='2025 Actual Data Points',
    zorder=10
)

# 3. 実績値の多項式適合線 (前回の3次を使用)
X_poly = df_comparison['Month'].values
Y_actual = df_comparison['Actual_Pilgrims'].values
degree = 3
coefficients = np.polyfit(X_poly, Y_actual, deg=degree)
Y_poly_fit = np.polyval(coefficients, X_poly)

plt.plot(
    df_comparison['Month_Name'],
    Y_poly_fit,
    linestyle='-',
    color='red',
    alpha=0.6,
    label=f'2025 Actual Trend (Degree {degree} Polynomial Fit)',
    linewidth=2
)

plt.title(f'2025 Pilgrim Counts: Prediction vs. Actual (Quadratic Trend)', fontsize=14)
plt.xlabel('Month', fontsize=12)
plt.ylabel('Number of Pilgrims', fontsize=12)
plt.grid(axis='y', alpha=0.5)
plt.legend()
plt.tight_layout()
plt.show()

--- モデル係数 (Quadratic Trend Model) ---
Intercept (切片): 9366.56
Time: 47.8790
Time_sq: 0.2959
sin_month: -13154.0899
cos_month: -17838.4006

--- 2025 Prediction vs. Actual Detailed Comparison (Quadratic Trend) ---
   Month_Name  Predicted_Pilgrims  Actual_Pilgrims  Error  Percentage_Error
0         Jan               18397             2781 -15616            -561.5
1         Feb               20309             3410 -16899            -495.6
2         Mar               27665            13437 -14228            -105.9
3         Apr               38545            52399  13854              26.4
4         May               50089            74556  24467              32.8
5         Jun               59256            69529  10273              14.8
6         Jul               63644            63856    212               0.3
7         Aug               62131            73121  10990              15.0
8         Sep               55176            76985  21809              28.3
9         Oct               44697            57349  12652              22.1
10        Nov               33557            10022 -23535            -234.8
11        Dec               24794              926 -23868           -2577.5

Annual Total Predicted (Quadratic): 498,260
Annual Total Actual: 498,371
Total Error Percentage: 0.02%

	Month	2004	2005	2006	2007	2008	2009	2010	2011	2012	...	2017	2018	2019	2020	2021	2022	2023	2024	Total	Ratio
0	Jan	696	269	314	350	306	520	1169	649	512	...	1355	1628	1737	2001	60	1616	2030	2125	21569	0.40%
1	Feb	1404	558	351	666	703	681	1640	820	1300	...	1696	2181	2119	3076	14	2034	2874	2959	30705	0.60%
2	Mar	3105	3128	1093	1680	5328	1808	5879	3024	3257	...	5176	11056	7474	1948	194	7389	11497	22163	121724	2.50%
3	Apr	15556	3307	7438	8112	5655	10244	19580	14154	14753	...	26928	22067	31721	1024	34282	40811	40944	0	358280	7.20%
4	May	16862	9310	9992	12898	15988	16445	28787	19833	21766	...	35345	40665	46673	4295	48255	60332	69572	0	573438	11.60%

Element	Color / Style	Explanation
Observed Data (Cleaned Training Set)	Black Scattered Dots	These are the actual monthly pilgrim counts used to train the model. Data points corresponding to the highly disrupted 2020 and 2021 years were deliberately excluded during the model training phase. This cleaning process ensures that the model learns the stable, long-term trend and seasonality, rather than the temporary anomalies caused by the pandemic (e.g., 2020 totals are significantly lower than surrounding years).
Fitted Model (Training)	Solid Blue Line	This line represents the mathematical function (a combination of a continuous linear trend and trigonometric functions for seasonality) that was fitted to the observed black dots using the Least Squares method. The blue line is the best fit function that simultaneously captures the yearly cyclical pattern and the overall growth over time.
2025 Prediction	Red Dashed Line and Markers	This is the forecast generated by extrapolating the fitted blue function into the year 2025. It shows the expected monthly pilgrim counts for 2025, assuming the historical trend and seasonal patterns continue, and barring any major exogenous events like the pandemic.

Number of Compostela Recipients by Month¶

2025年巡礼者数予測のためのPythonコード（CSV読み込み対応版）¶