import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# ----------------------------------------------------------------------
# STEP 1: Load CSV file
# ----------------------------------------------------------------------

csv_path = "datasets/finalproject/country.csv"

# Read the table and promote the "Country" column to the row index.
df_country = pd.read_csv(csv_path).set_index("Country")

print("--- CSV import result（first 5 lines） ---")
print(df_country.head())

# Row labels (country names) and the feature matrix of yearly pilgrim counts.
country_names = df_country.index.values
X = df_country.values


# ----------------------------------------------------------------------
# STEP 2: Standardization
# ----------------------------------------------------------------------

# Learn the per-column scaling parameters first, then apply them to the
# same matrix (equivalent to a single fit_transform call).
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


# ----------------------------------------------------------------------
# STEP 3: K-Means clustering
# ----------------------------------------------------------------------

k_optimal = 4
print(f"\nClustering... (K={k_optimal})")

# Fixed random_state keeps the cluster assignment reproducible between runs.
kmeans = KMeans(n_clusters=k_optimal, random_state=42, n_init=10)
kmeans.fit(X_scaled)
clusters = kmeans.labels_

# Store each country's cluster label next to its yearly columns.
df_country["Cluster"] = clusters


# ----------------------------------------------------------------------
# STEP 4: PCA visualization
# ----------------------------------------------------------------------

# Reduce the standardized features to two components for a 2-D scatter plot.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scaled)

pca_df = pd.DataFrame(
    {
        "PC1": principal_components[:, 0],
        "PC2": principal_components[:, 1],
        "Country": country_names,
        "Cluster": clusters,
    }
)

# Japan is drawn separately so it can be highlighted.
mask_japan = pca_df["Country"] == "Japan"
others = pca_df[~mask_japan]
japan = pca_df[mask_japan]

plt.figure(figsize=(12, 8))

# ---- All countries except Japan, coloured by cluster label ----
scatter = plt.scatter(
    others["PC1"],
    others["PC2"],
    c=others["Cluster"],
    cmap="viridis",
    s=150,
    alpha=0.8,
)

# ---- Japan as a larger red marker with a black edge ----
plt.scatter(
    japan["PC1"],
    japan["PC2"],
    color="red",
    edgecolors="black",
    s=220,
    alpha=1.0,
    label="Japan",
)

# ---- Country-name labels, offset 5 points to the right of each marker ----
for name, pc1, pc2 in zip(pca_df["Country"], pca_df["PC1"], pca_df["PC2"]):
    plt.annotate(
        name,
        (pc1, pc2),
        fontsize=10,
        color="red" if name == "Japan" else "black",
        xytext=(5, 0),
        textcoords="offset points",
    )

plt.title(f"Clustering countries by pilgrim patterns (K={k_optimal})")
plt.xlabel("Principal Component 1 (PC1)")
plt.ylabel("Principal Component 2 (PC2)")
plt.grid(True, alpha=0.3)
plt.colorbar(scatter)
plt.legend()
plt.show()


# ----------------------------------------------------------------------
# STEP 5: Display results
# ----------------------------------------------------------------------

# One line per cluster listing its member countries.
print("\n--- Clustering results ---")
cluster_labels = df_country["Cluster"]
for cluster_id in range(k_optimal):
    members = df_country.loc[cluster_labels == cluster_id].index
    print(f"Cluster {cluster_id}: {', '.join(members)}")

# Per-cluster mean of every numeric column.
print("\n--- Cluster-level averages ---")
cluster_averages = df_country.groupby("Cluster").mean(numeric_only=True)
print(cluster_averages)

--- CSV import result（first 5 lines） ---
            2004   2005   2006   2007   2008   2009    2010   2011   2012  \
Country                                                                     
Spain     137163  52928  52248  55326  61112  79007  188089  97822  95275   
USA         2028   2047   1909   2229   2214   2540    3334   3726   7071   
Italy       7670   7430  10013  10275  10707  10341   14222  12183  12404   
Germany     6816   7155   8097  13837  15746  14789   14503  16596  15620   
Portugal    3252   2574   3365   4001   4341   4854    7786   8649  10329   

            2016    2017    2018    2019   2020    2021    2022    2023  \
Country                                                                   
Spain     124230  132479  144141  146350  37061  122128  239417  197185   
USA        15236   17522   18582   20652    451    5668   26014   32069   
Italy      23944   27073   27009   28749   2706    7817   27080   28649   
Germany    21220   23227   25296   26167   2344    6575   23215   24347   
Portugal   13245   12940   14413   17450   2971    9410   20166   20698   

            2024  
Country           
Spain     208378  
USA        38052  
Italy      28599  
Germany    23432  
Portugal   21935  

Clustering... (K=4)

--- Clustering results ---
Cluster 0: UK, Ireland, Mexico, South Korea, Canada, Australia, Poland, Brazil, Netherlands, Czech Republic, Argentina, Colombia, Denmark, Taiwan, Belgium, China, Austria, Switzerland, Hungary, Slovakia, Japan, Venezuela, Sweden, Norway, Slovenia, Finland
Cluster 1: Spain
Cluster 2: Italy, Germany
Cluster 3: USA, Portugal, France, Others

--- Cluster-level averages ---
                  2004          2005          2006     2007          2008  \
Cluster                                                                     
0           572.115385    568.461538    637.038462    754.0    866.692308   
1        137163.000000  52928.000000  52248.000000  55326.0  61112.000000   
2          7243.000000   7292.500000   9055.000000  12056.0  13226.500000   
3          3355.000000   2907.750000   3364.000000   3746.0   3760.500000   

                 2009           2010          2011          2012  \
Cluster                                                            
0          948.230769    1171.653846   1251.269231   1479.653846   
1        79007.000000  188089.000000  97822.000000  95275.000000   
2        12565.000000   14362.500000  14389.500000  14012.000000   
3         4271.750000    6214.500000   6058.000000   7679.500000   

                  2016           2017           2018           2019  \
Cluster                                                               
0          2320.423077    2513.846154    2789.692308    3029.807692   
1        124230.000000  132479.000000  144141.000000  146350.000000   
2         22582.000000   25150.000000   26152.500000   27458.000000   
3         12045.750000   13224.250000   14600.000000   16884.250000   

                 2020           2021           2022           2023  \
Cluster                                                              
0          209.923077     680.576923    2905.961538    3922.730769   
1        37061.000000  122128.000000  239417.000000  197185.000000   
2         2525.000000    7196.000000   25147.500000   26498.000000   
3         1643.750000    6174.500000   18258.750000   22049.750000   

                  2024  
Cluster                 
0          4602.615385  
1        208378.000000  
2         26015.500000  
3         23178.250000

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# ----------------------------------------------------------------------
# STEP 1: Load CSV file
# ----------------------------------------------------------------------

csv_path = "datasets/finalproject/country.csv"

# Read the table and promote the "Country" column to the row index.
df_country = pd.read_csv(csv_path).set_index("Country")

# Added: keep only the yearly data (drops a "Cluster" column if one is present).
df_features = df_country.drop(columns=["Cluster"], errors="ignore")

# Only the header line is printed; the table preview itself is currently disabled.
print("--- CSV import result（first 5 lines） ---")

# Country labels and the yearly pilgrim counts.
# NOTE: X is not the matrix that gets clustered in this cell; STEP 2 builds
# its features from df_features instead.
country_names = df_country.index.values
X = df_country.values


# ----------------------------------------------------------------------
# STEP 2: Standardization
# ----------------------------------------------------------------------

# Added: convert each country's row into composition ratios, i.e. every
# yearly value divided by that country's total over all years.
row_totals = df_features.sum(axis=1)
X_ratio = df_features.div(row_totals, axis=0)

# Added: standardization of the ratio columns.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_ratio)

# ----------------------------------------------------------------------
# STEP 3: K-Means clustering
# ----------------------------------------------------------------------

k_optimal = 4
print(f"\nClustering... (K={k_optimal})")

# Fixed random_state keeps the cluster assignment reproducible between runs.
kmeans = KMeans(n_clusters=k_optimal, random_state=42, n_init=10)
kmeans.fit(X_scaled)
clusters = kmeans.labels_

# Store each country's cluster label next to its yearly columns.
df_country["Cluster"] = clusters

# =======================================
# Added: compute the silhouette score for each candidate K
# =======================================

# silhouette_score is not among the imports at the top of this cell,
# so it is imported here.
from sklearn.metrics import silhouette_score

# Scores are appended in the same order as the K values tried below.
silhouette_scores = []

print("K  | Silhouette score")
print("----------------------")

for k in range(2, 9):
    # Use a separate name for the trial model. Rebinding `kmeans` here would
    # replace the K=k_optimal model fitted above with the last trial (K=8),
    # leaving `kmeans` out of sync with `clusters` and df_country["Cluster"].
    trial_model = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = trial_model.fit_predict(X_scaled)

    score = silhouette_score(X_scaled, labels)
    silhouette_scores.append(score)

    print(f"{k:<3}| {score:.3f}")
# =======================================

# ----------------------------------------------------------------------
# STEP 4: PCA visualization
# ----------------------------------------------------------------------

#pca = PCA(n_components=2)
#principal_components = pca.fit_transform(X_scaled)

#pca_df = pd.DataFrame(principal_components, columns=["PC1", "PC2"])
#pca_df["Country"] = country_names
#pca_df["Cluster"] = clusters

#plt.figure(figsize=(12, 8))

# ---- Plot all countries except Japan ----
#mask_japan = (pca_df["Country"] == "Japan")

#scatter = plt.scatter(
#    pca_df.loc[~mask_japan, "PC1"],
#    pca_df.loc[~mask_japan, "PC2"],
#    c=pca_df.loc[~mask_japan, "Cluster"],
#    cmap="viridis",
#    s=150,
#    alpha=0.8
#)

# ---- Plot Japan in red ----
#plt.scatter(
#    pca_df.loc[mask_japan, "PC1"],
#    pca_df.loc[mask_japan, "PC2"],
#    color="red",
#    edgecolors="black",
#    s=220,
#    alpha=1.0,
#    label="Japan"
#)

# ---- Annotate country names ----
#for i, row in pca_df.iterrows():
#    label_color = "red" if row["Country"] == "Japan" else "black"
#
#    plt.annotate(
#        row["Country"],
#        (row["PC1"], row["PC2"]),
#        fontsize=10,
#        color=label_color,
#        xytext=(5, 0),
#        textcoords="offset points"
#    )

#plt.title(f"Clustering countries by pilgrim patterns (K={k_optimal})")
#plt.xlabel("Principal Component 1 (PC1)")
#plt.ylabel("Principal Component 2 (PC2)")
#plt.grid(True, alpha=0.3)
#plt.colorbar(scatter)
#plt.legend()
#plt.show()

# =======================================
# Added: judge the best K visually from a plot
# =======================================

# K values on the x-axis; this must match the range used when the
# silhouette scores were collected.
candidate_ks = range(2, 9)

plt.figure()
plt.plot(candidate_ks, silhouette_scores, marker="o")
plt.title("Silhouette analysis for optimal K")
plt.xlabel("Number of clusters (K)")
plt.ylabel("Silhouette score")
plt.grid(True)
plt.show()
# =======================================

# ----------------------------------------------------------------------
# STEP 5: Display results
# ----------------------------------------------------------------------

#print("\n--- Clustering results ---")
#for i in range(k_optimal):
#    cluster_countries = df_country[df_country["Cluster"] == i].index.tolist()
#    print(f"Cluster {i}: {', '.join(cluster_countries)}")

#print("\n--- Cluster-level averages ---")
#print(df_country.groupby("Cluster").mean(numeric_only=True))

--- CSV import result（first 5 lines） ---

Clustering... (K=4)
K  | Silhouette score
----------------------
2  | 0.569
3  | 0.313
4  | 0.310
5  | 0.252
6  | 0.221
7  | 0.219
8  | 0.239

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# ----------------------------------------------------------------------
# STEP 1: Load CSV file
# ----------------------------------------------------------------------

csv_path = "datasets/finalproject/country.csv"

# Read the table and promote the "Country" column to the row index.
df_country = pd.read_csv(csv_path).set_index("Country")

print("--- CSV import result（first 5 lines） ---")
print(df_country.head())

# Row labels (country names) and the feature matrix of yearly pilgrim counts.
country_names = df_country.index.values
X = df_country.values


# ----------------------------------------------------------------------
# STEP 2: Standardization
# ----------------------------------------------------------------------

# Learn the per-column scaling parameters first, then apply them to the
# same matrix (equivalent to a single fit_transform call).
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


# ----------------------------------------------------------------------
# STEP 3: K-Means clustering
# ----------------------------------------------------------------------

# This run uses K=2.
k_optimal = 2
print(f"\nClustering... (K={k_optimal})")

# Fixed random_state keeps the cluster assignment reproducible between runs.
kmeans = KMeans(n_clusters=k_optimal, random_state=42, n_init=10)
kmeans.fit(X_scaled)
clusters = kmeans.labels_

# Store each country's cluster label next to its yearly columns.
df_country["Cluster"] = clusters


# ----------------------------------------------------------------------
# STEP 4: PCA visualization
# ----------------------------------------------------------------------

# Reduce the standardized features to two components for a 2-D scatter plot.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scaled)

pca_df = pd.DataFrame(
    {
        "PC1": principal_components[:, 0],
        "PC2": principal_components[:, 1],
        "Country": country_names,
        "Cluster": clusters,
    }
)

# Japan is drawn separately so it can be highlighted.
mask_japan = pca_df["Country"] == "Japan"
others = pca_df[~mask_japan]
japan = pca_df[mask_japan]

plt.figure(figsize=(12, 8))

# ---- All countries except Japan, coloured by cluster label ----
scatter = plt.scatter(
    others["PC1"],
    others["PC2"],
    c=others["Cluster"],
    cmap="viridis",
    s=150,
    alpha=0.8,
)

# ---- Japan as a larger red marker with a black edge ----
plt.scatter(
    japan["PC1"],
    japan["PC2"],
    color="red",
    edgecolors="black",
    s=220,
    alpha=1.0,
    label="Japan",
)

# ---- Country-name labels, offset 5 points to the right of each marker ----
for name, pc1, pc2 in zip(pca_df["Country"], pca_df["PC1"], pca_df["PC2"]):
    plt.annotate(
        name,
        (pc1, pc2),
        fontsize=10,
        color="red" if name == "Japan" else "black",
        xytext=(5, 0),
        textcoords="offset points",
    )

plt.title(f"Clustering countries by pilgrim patterns (K={k_optimal})")
plt.xlabel("Principal Component 1 (PC1)")
plt.ylabel("Principal Component 2 (PC2)")
plt.grid(True, alpha=0.3)
plt.colorbar(scatter)
plt.legend()
plt.show()


# ----------------------------------------------------------------------
# STEP 5: Display results
# ----------------------------------------------------------------------

# One line per cluster listing its member countries.
print("\n--- Clustering results ---")
cluster_labels = df_country["Cluster"]
for cluster_id in range(k_optimal):
    members = df_country.loc[cluster_labels == cluster_id].index
    print(f"Cluster {cluster_id}: {', '.join(members)}")

# Per-cluster mean of every numeric column.
print("\n--- Cluster-level averages ---")
cluster_averages = df_country.groupby("Cluster").mean(numeric_only=True)
print(cluster_averages)

--- CSV import result（first 5 lines） ---
            2004   2005   2006   2007   2008   2009    2010   2011   2012  \
Country                                                                     
Spain     137163  52928  52248  55326  61112  79007  188089  97822  95275   
USA         2028   2047   1909   2229   2214   2540    3334   3726   7071   
Italy       7670   7430  10013  10275  10707  10341   14222  12183  12404   
Germany     6816   7155   8097  13837  15746  14789   14503  16596  15620   
Portugal    3252   2574   3365   4001   4341   4854    7786   8649  10329   

            2016    2017    2018    2019   2020    2021    2022    2023  \
Country                                                                   
Spain     124230  132479  144141  146350  37061  122128  239417  197185   
USA        15236   17522   18582   20652    451    5668   26014   32069   
Italy      23944   27073   27009   28749   2706    7817   27080   28649   
Germany    21220   23227   25296   26167   2344    6575   23215   24347   
Portugal   13245   12940   14413   17450   2971    9410   20166   20698   

            2024  
Country           
Spain     208378  
USA        38052  
Italy      28599  
Germany    23432  
Portugal   21935  

Clustering... (K=2)

--- Clustering results ---
Cluster 0: USA, Italy, Germany, Portugal, UK, France, Ireland, Mexico, South Korea, Canada, Australia, Poland, Brazil, Netherlands, Czech Republic, Argentina, Colombia, Denmark, Taiwan, Belgium, China, Austria, Switzerland, Hungary, Slovakia, Japan, Venezuela, Sweden, Norway, Slovenia, Finland, Others
Cluster 1: Spain

--- Cluster-level averages ---
                 2004       2005         2006       2007         2008  \
Cluster                                                                 
0          1336.90625   1281.125   1504.03125   1834.375   2000.90625   
1        137163.00000  52928.000  52248.00000  55326.000  61112.00000   

                2009         2010      2011         2012         2016  \
Cluster                                                                 
0         2089.71875    2626.4375   2673.25   3037.90625    4802.4375   
1        79007.00000  188089.0000  97822.00  95275.00000  124230.0000   

                 2017          2018        2019         2020          2021  \
Cluster                                                                      
0          5267.40625    5726.15625    6288.375    533.84375    1774.53125   
1        132479.00000  144141.00000  146350.000  37061.00000  122128.00000   

                 2022         2023        2024  
Cluster                                         
0          6215.15625    7599.5625    8262.875  
1        239417.00000  197185.0000  208378.000

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# ----------------------------------------------------------------------
# STEP 1: Load CSV file
# ----------------------------------------------------------------------

csv_path = "datasets/finalproject/country.csv"

# Read the table and promote the "Country" column to the row index.
df_country = pd.read_csv(csv_path).set_index("Country")

# Added: keep only the yearly data (drops a "Cluster" column if one is present).
df_features = df_country.drop(columns=["Cluster"], errors="ignore")

# Only the header line is printed; the table preview itself is currently disabled.
print("--- CSV import result（first 5 lines） ---")

# Country labels and the yearly pilgrim counts.
# NOTE: X is not the matrix that gets clustered in this cell; STEP 2 builds
# its features from df_features instead.
country_names = df_country.index.values
X = df_country.values


# ----------------------------------------------------------------------
# STEP 2: Standardization
# ----------------------------------------------------------------------

# Added: convert each country's row into composition ratios, i.e. every
# yearly value divided by that country's total over all years.
row_totals = df_features.sum(axis=1)
X_ratio = df_features.div(row_totals, axis=0)

# Added: standardization of the ratio columns.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_ratio)

# ----------------------------------------------------------------------
# STEP 3: K-Means clustering
# ----------------------------------------------------------------------

# K=2 here (an earlier run used 4).
k_optimal = 2
print(f"\nClustering... (K={k_optimal})")

# Fixed random_state keeps the cluster assignment reproducible between runs.
kmeans = KMeans(n_clusters=k_optimal, random_state=42, n_init=10)
kmeans.fit(X_scaled)
clusters = kmeans.labels_

# Store each country's cluster label next to its yearly columns.
df_country["Cluster"] = clusters

# =======================================
# Added: compute the silhouette score for each candidate K
# =======================================

# silhouette_score is not among the imports at the top of this cell,
# so it is imported here.
from sklearn.metrics import silhouette_score

# Scores are appended in the same order as the K values tried below.
silhouette_scores = []

print("K  | Silhouette score")
print("----------------------")

for k in range(2, 9):
    # Use a separate name for the trial model. Rebinding `kmeans` here would
    # replace the K=k_optimal model fitted above with the last trial (K=8),
    # leaving `kmeans` out of sync with `clusters` and df_country["Cluster"].
    trial_model = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = trial_model.fit_predict(X_scaled)

    score = silhouette_score(X_scaled, labels)
    silhouette_scores.append(score)

    print(f"{k:<3}| {score:.3f}")
# =======================================

# ----------------------------------------------------------------------
# STEP 4: PCA visualization
# ----------------------------------------------------------------------

# Reduce the standardized features to two components for a 2-D scatter plot.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scaled)

pca_df = pd.DataFrame(
    {
        "PC1": principal_components[:, 0],
        "PC2": principal_components[:, 1],
        "Country": country_names,
        "Cluster": clusters,
    }
)

# Japan is drawn separately so it can be highlighted.
mask_japan = pca_df["Country"] == "Japan"
others = pca_df[~mask_japan]
japan = pca_df[mask_japan]

plt.figure(figsize=(12, 8))

# ---- All countries except Japan, coloured by cluster label ----
scatter = plt.scatter(
    others["PC1"],
    others["PC2"],
    c=others["Cluster"],
    cmap="viridis",
    s=150,
    alpha=0.8,
)

# ---- Japan as a larger red marker with a black edge ----
plt.scatter(
    japan["PC1"],
    japan["PC2"],
    color="red",
    edgecolors="black",
    s=220,
    alpha=1.0,
    label="Japan",
)

# ---- Country-name labels, offset 5 points to the right of each marker ----
for name, pc1, pc2 in zip(pca_df["Country"], pca_df["PC1"], pca_df["PC2"]):
    plt.annotate(
        name,
        (pc1, pc2),
        fontsize=10,
        color="red" if name == "Japan" else "black",
        xytext=(5, 0),
        textcoords="offset points",
    )

plt.title(f"Clustering countries by pilgrim patterns (K={k_optimal})")
plt.xlabel("Principal Component 1 (PC1)")
plt.ylabel("Principal Component 2 (PC2)")
plt.grid(True, alpha=0.3)
plt.colorbar(scatter)
plt.legend()
plt.show()

# =======================================
# Added: judge the best K visually from a plot
# =======================================

# K values on the x-axis; this must match the range used when the
# silhouette scores were collected.
candidate_ks = range(2, 9)

plt.figure()
plt.plot(candidate_ks, silhouette_scores, marker="o")
plt.title("Silhouette analysis for optimal K")
plt.xlabel("Number of clusters (K)")
plt.ylabel("Silhouette score")
plt.grid(True)
plt.show()
# =======================================

# ----------------------------------------------------------------------
# STEP 5: Display results
# ----------------------------------------------------------------------

# One line per cluster listing its member countries.
print("\n--- Clustering results ---")
cluster_labels = df_country["Cluster"]
for cluster_id in range(k_optimal):
    members = df_country.loc[cluster_labels == cluster_id].index
    print(f"Cluster {cluster_id}: {', '.join(members)}")

# Per-cluster mean of every numeric column (yearly counts in df_country).
print("\n--- Cluster-level averages ---")
cluster_averages = df_country.groupby("Cluster").mean(numeric_only=True)
print(cluster_averages)

#=============================
# Added: time-series pattern of the cluster averages
#=============================
# The clustering was fitted on each country's yearly share of its own total
# (X_ratio), and the y-axis is labelled "Proportion". Average those shares
# per cluster rather than the raw counts in df_country, so the curves match
# the axis label and the features the clusters were built from.
cluster_means = (
    X_ratio.assign(Cluster=df_country["Cluster"].values)
    .groupby("Cluster")
    .mean()
)

# Transpose so years run along the x-axis and each cluster is one line.
cluster_means.T.plot(figsize=(10, 6))
plt.title("Average pilgrim patterns by cluster")
plt.xlabel("Year")
plt.ylabel("Proportion")
plt.grid(True)
plt.show()

--- CSV import result（first 5 lines） ---

Clustering... (K=2)
K  | Silhouette score
----------------------
2  | 0.569
3  | 0.313
4  | 0.310
5  | 0.252
6  | 0.221
7  | 0.219
8  | 0.239

--- Clustering results ---
Cluster 0: Taiwan, China
Cluster 1: Spain, USA, Italy, Germany, Portugal, UK, France, Ireland, Mexico, South Korea, Canada, Australia, Poland, Brazil, Netherlands, Czech Republic, Argentina, Colombia, Denmark, Belgium, Austria, Switzerland, Hungary, Slovakia, Japan, Venezuela, Sweden, Norway, Slovenia, Finland, Others

--- Cluster-level averages ---
                2004         2005         2006         2007         2008  \
Cluster                                                                    
0           0.000000     0.000000     0.000000     0.000000     0.000000   
1        5804.645161  3029.806452  3237.967742  3678.258065  4036.806452   

                2009         2010         2011         2012         2016  \
Cluster                                                                    
0           0.000000     0.000000     0.000000     0.000000     0.000000   
1        4705.741935  8778.548387  5915.032258  6209.290323  8964.774194   

               2017          2018          2019         2020         2021  \
Cluster                                                                     
0           0.00000      0.000000      0.000000     0.000000   144.000000   
1        9710.83871  10560.580645  11212.193548  1746.580645  5762.096774   

                 2022          2023          2024  
Cluster                                            
0          616.000000   1774.500000   3131.500000  
1        14099.032258  14091.032258  15049.258065

7. Transforms

Clustering Countries by Pilgrim Patterns Using K-Means and PCA

Result

Try 2. Select K based on the silhouette score