Kae Nagano - Fab Futures - Data Science

Class¶

- Transforms¶

Assignment¶

  • Analyze your data set
  • prepare a notebook with the analysis of your data set, store it in your repo, and call it presentation.ipynb
  • include a 1920x1080 summary slide describing you, your data, and your analysis, store it in your repo's images folder, and call it presentation.png

Approach¶

1. Principal Components Analysis (PCA) with Fab Academy Git commit data¶

Based on the PCA example from class, I applied PCA to the weekly Git commit data of Fab Academy students. PCA itself does not require labels, but I used each student's year as y to color the plots.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn.preprocessing
import sklearn.decomposition
import pandas as pd

np.set_printoptions(precision=1)

df = pd.read_csv("datasets/fabacademy_commit_weekly_summary.csv")
week_cols = [f"week_{i:02d}" for i in range(1, 31)]   # week_01 … week_30
X = df[week_cols].values
y = df["year"].values

print(f"Fab Academy weekly commit data shape (records, weeks): {X.shape}")

#
# plot week1 and week10
#
plt.scatter(X[:,0], X[:,9], c=y)
plt.xlabel("week_01 commits")
plt.ylabel("week_10 commits")
plt.title("Fab Academy: week_01 vs week_10")
plt.colorbar(label="year")
plt.show()

#
# standardize (zero mean, unit variance) to eliminate dependence on data scaling
#
scaler = sklearn.preprocessing.StandardScaler()
Xscale = scaler.fit_transform(X)

#
# do 10 component PCA
#
pca = sklearn.decomposition.PCA(n_components=10)
pca.fit(Xscale)
print(f"explained variance: {pca.explained_variance_}")
Xpca = pca.transform(Xscale)

#
# plot vs first two PCA components
#
plt.scatter(Xpca[:,0], Xpca[:,1], c=y, s=8)
plt.xlabel("principal component 1")
plt.ylabel("principal component 2")
plt.title("Fab Academy: weekly commits vs PCA components")
plt.colorbar(label="year")
plt.show()
Fab Academy weekly commit data shape (records, weeks): (1509, 30)
[figure: week_01 vs week_10 commits, colored by year]
explained variance: [8.5 2.8 1.9 1.7 1.2 1.  1.  1.  0.9 0.8]
[figure: weekly commits projected onto the first two principal components, colored by year]
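
The explained-variance numbers above are easier to interpret as ratios, and the component loadings show which weeks drive PC1. A minimal sketch, assuming pca and week_cols from the cell above:

In [ ]:
# Sketch: summarize the PCA fit from the cell above.
# Assumes `pca` and `week_cols` are still defined in this session.
import numpy as np

ratios = pca.explained_variance_ratio_
print(f"PC1 + PC2 explain {100 * ratios[:2].sum():.1f}% of the variance")

# weeks with the largest absolute loadings on the first component
pc1 = pca.components_[0]
for i in np.argsort(np.abs(pc1))[::-1][:5]:
    print(f"{week_cols[i]}: loading {pc1[i]:+.2f}")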

Using the PCA components, I performed k-means clustering as in Week 3_2.

In [10]:
#
# use PCA components for K-means (PC1, PC2)
#
from scipy.spatial import Voronoi, voronoi_plot_2d

x = Xpca[:,0]
y = Xpca[:,1]

#
# k-means parameters
#
nclusters = 5
nsteps = 10
np.random.seed(0)

#
# choose starting points
#
indices = np.random.uniform(low=0, high=len(x), size=nclusters).astype(int)
mux = x[indices]
muy = y[indices]

#
# plot before iteration
#
fig, ax = plt.subplots()
plt.plot(x, y, '.')
vor = Voronoi(np.stack((mux, muy), axis=1))
voronoi_plot_2d(vor, ax=ax, show_points=True, show_vertices=False, point_size=20)
plt.title('before k-means iterations (PCA space)')
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

#
# do k-means iteration 
#
for i in range(nsteps):
    xm = np.outer(x, np.ones(len(mux)))
    ym = np.outer(y, np.ones(len(muy)))
    muxm = np.outer(np.ones(len(x)), mux)
    muym = np.outer(np.ones(len(x)), muy)
    
    distances = np.sqrt((xm - muxm)**2 + (ym - muym)**2)
    mins = np.argmin(distances, axis=1)

    for k in range(len(mux)):
        index = np.where(mins == k)
        if len(index[0]) > 0:
            mux[k] = np.sum(x[index]) / len(index[0])
            muy[k] = np.sum(y[index]) / len(index[0])

#
# plot after 
#
fig, ax = plt.subplots()
plt.plot(x, y, '.')
vor = Voronoi(np.stack((mux, muy), axis=1))
voronoi_plot_2d(vor, ax=ax, show_points=True, show_vertices=False, point_size=20)
plt.title('after k-means iterations (PCA space)')
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

#
# assign cluster IDs
#
cluster_ids = mins
df["pca_kmeans_cluster"] = cluster_ids
df.to_csv("datasets/fabacademy_pca_kmeans5.csv", index=False)

print("cluster counts:", np.bincount(cluster_ids))
print("first few cluster assignments:", cluster_ids[:20])
[figure: Voronoi partition of (PC1, PC2) before k-means iterations]
[figure: Voronoi partition of (PC1, PC2) after k-means iterations]
cluster counts: [ 110   33   70 1210  580]
first few cluster assignments: [3 3 3 0 4 3 3 3 3 3 3 3 0 4 0 1 0 0 4 3]
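
As a cross-check on the hand-rolled loop above, scikit-learn's KMeans on the same two components should produce similar groupings (the cluster numbering may be permuted). A minimal sketch, assuming Xpca from the PCA cell:

In [ ]:
# Sketch: compare the loop above against sklearn's k-means on (PC1, PC2).
# Assumes `Xpca` from the PCA cell; labels may be numbered differently.
import numpy as np
from sklearn.cluster import KMeans

km = KMeans(n_clusters=5, n_init=10, random_state=0)
labels = km.fit_predict(Xpca[:, :2])
print("sklearn cluster counts:", np.bincount(labels))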

2. CNN → PCA → k-means with Fab Academy Git commit image data¶

Using the CNN from Week 2-2 as a feature extractor on the Fab Academy Git commit images, I built a CNN → PCA → k-means pipeline to cluster them. The dataset gives every image the same dummy label of 0, so the classification loss drops to zero almost immediately; the network is used only to produce feature maps. I asked ChatGPT how to apply PCA to the features I extracted in Week 2 and added that step to my workflow.

In [2]:
!pip install torch torchvision
Requirement already satisfied: torch in /opt/conda/lib/python3.13/site-packages (2.9.1)
Requirement already satisfied: torchvision in /opt/conda/lib/python3.13/site-packages (0.24.1)
... (remaining dependency requirements already satisfied)
In [35]:
import torch
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from pathlib import Path

transform = transforms.Compose([
    transforms.Grayscale(),     # transforms are applied in order from the top
    #transforms.Resize((32,32)), # matched MNIST before; might regenerate the source images at 32x32 instead
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])


class FabAcademyCommitDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.files = list(Path(root_dir).glob("*.jpg"))
        self.transform = transform

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        img = Image.open(self.files[idx])
        if self.transform:
            img = self.transform(img)
        return img, 0

trainset = FabAcademyCommitDataset(
    root_dir="datasets/commitsImage2",
    transform=transform
)

batch_size = 4

trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)  # the original settings seem fine
#trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=0)

inputs, labels = next(iter(trainloader))
print("BATCH SHAPE:", inputs.shape)  # 1509 images / batch_size 4 -> 378 batches

testset = trainset   # build a proper test set later; maybe hold some data out for next week
testloader = trainloader

# --- Define a Convolutional Neural Network ---

import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 3, 5)   # 1 input channel (grayscale), 3 filters, 5x5 kernel
        self.pool  = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(3, 6, 5)
        self.fc1   = nn.Linear(6 * 13 * 13, 120)   # 64x64 input -> 30x30 after conv1+pool -> 13x13 after conv2+pool
        self.fc2   = nn.Linear(120, 84)
        self.fc3   = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

net = Net()

print(net)

# --- feature map hook  ---
feature_maps = {}

def register_activation_hook(name):
    def hook(module, input, output):
        feature_maps[name] = output.detach()
    return hook

net.conv1.register_forward_hook(register_activation_hook("conv1"))
net.conv2.register_forward_hook(register_activation_hook("conv2"))


# --- Define a Loss function and optimizer ---

import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)


# --- Train the network ---
print("LEN trainloader:", len(trainloader))

for epoch in range(2): # loop over the dataset multiple times
#for epoch in range(1): # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        #print("LOOP = ", i)
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 10 == 0:
            print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / (i+1):.3f}")
            running_loss = 0.0

print("Finished Training")
# --- save model ---
torch.save(net.state_dict(), "fabacademy_cnn.pth")

# --- Visualize feature map ---
import matplotlib.pyplot as plt

def visualize_feature_map(name):
    fmap = feature_maps[name]           
    fmap = fmap.squeeze(0).cpu().numpy()   # [C, H, W]

    num_channels = fmap.shape[0]
    cols = 8
    rows = (num_channels + cols - 1) // cols

    fig = plt.figure(figsize=(16, 2*rows))
    for i in range(num_channels):
        ax = fig.add_subplot(rows, cols, i+1)
        ax.imshow(fmap[i], cmap='gray')
        ax.axis("off")

    plt.suptitle(f"{name} feature maps")
    plt.show()
    plt.close(fig)


visualize_feature_map("conv1")
visualize_feature_map("conv2")
BATCH SHAPE: torch.Size([4, 1, 64, 64])
Net(
  (conv1): Conv2d(1, 3, kernel_size=(5, 5), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=1014, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)
LEN trainloader: 378
[1,     1] loss: 2.345
[1,    11] loss: 2.095
[1,    21] loss: 1.024
[1,    31] loss: 0.603
[1,    41] loss: 0.225
[1,    51] loss: 0.004
[1,    61] loss: 0.000
... (loss remains 0.000 for the rest of epoch 1 and all of epoch 2)
Finished Training
[figure: conv1 feature maps]
[figure: conv2 feature maps]
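
The in_features=1014 shown for fc1 in the printed architecture is just 6 * 13 * 13, which can be verified by tracing a dummy 64x64 grayscale input through the convolution and pooling layers. A minimal sketch, assuming the Net class defined above:

In [ ]:
# Sketch: confirm fc1's input size for 64x64 grayscale images.
# Assumes the Net class from the training cell above.
import torch

check = Net()
dummy = torch.zeros(1, 1, 64, 64)
with torch.no_grad():
    h = check.pool(torch.relu(check.conv1(dummy)))  # 64 -> 60 (5x5 conv) -> 30 (2x2 pool)
    h = check.pool(torch.relu(check.conv2(h)))      # 30 -> 26 (5x5 conv) -> 13 (2x2 pool)
print(h.shape)                # torch.Size([1, 6, 13, 13])
print(h.flatten(1).shape[1])  # 1014 = 6 * 13 * 13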
In [36]:
# --- PCA ---

loaded_net = Net()
loaded_net.load_state_dict(torch.load("fabacademy_cnn.pth"))
loaded_net.eval()

feature_maps_pca = {}

def register_pca_hook(name):
    def hook(module, input, output):
        pooled = loaded_net.pool(output)   # pool the conv2 output, matching the forward pass
        feature_maps_pca[name] = pooled.detach()
    return hook

loaded_net.conv2.register_forward_hook(register_pca_hook("conv2"))

trainloader = DataLoader(trainset, batch_size=1, shuffle=False)

features = []

for inputs, _ in trainloader:
    _ = loaded_net(inputs)
    fmap = feature_maps_pca["conv2"]
    fmap = fmap.squeeze(0).cpu().numpy()
    flat = fmap.reshape(-1)
    features.append(flat)

import numpy as np
X = np.array(features)
y = np.arange(len(X))
print("X shape:", X.shape)

plt.scatter(X[:,0], X[:,1], c=y)
plt.xlabel("feature_0")
plt.ylabel("feature_1")
plt.title("conv2 feature space (two dims)")
plt.colorbar()
plt.show()

import sklearn.preprocessing
import sklearn.decomposition

# standardize (zero mean, unit variance) to eliminate dependence on data scaling
scaler = sklearn.preprocessing.StandardScaler()
Xscale = scaler.fit_transform(X)

# do 10 component PCA
pca = sklearn.decomposition.PCA(n_components=10)
pca.fit(Xscale)
print("explained variance:", pca.explained_variance_)

Xpca = pca.transform(Xscale)

plt.scatter(Xpca[:,0], Xpca[:,1], c=y, s=8)
plt.xlabel("principal component 1")
plt.ylabel("principal component 2")
plt.title("Feature PCA")
plt.colorbar()
plt.show()

x = Xpca[:,0]
y = Xpca[:,1]

# k-means parameters
nclusters = 5
nsteps = 10
np.random.seed(0)

indices = np.random.uniform(low=0, high=len(x), size=nclusters).astype(int)
mux = x[indices]
muy = y[indices]

fig, ax = plt.subplots()
plt.plot(x, y, '.')
from scipy.spatial import Voronoi, voronoi_plot_2d
vor = Voronoi(np.stack((mux, muy), axis=1))
voronoi_plot_2d(vor, ax=ax, show_points=True, show_vertices=False, point_size=20)
plt.title('before k-means iterations (PCA space)')
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

for i in range(nsteps):
    xm = np.outer(x, np.ones(len(mux)))
    ym = np.outer(y, np.ones(len(muy)))
    muxm = np.outer(np.ones(len(x)), mux)
    muym = np.outer(np.ones(len(x)), muy)

    distances = np.sqrt((xm - muxm)**2 + (ym - muym)**2)
    mins = np.argmin(distances, axis=1)

    for k in range(len(mux)):
        index = np.where(mins == k)
        if len(index[0]) > 0:
            mux[k] = np.sum(x[index]) / len(index[0])
            muy[k] = np.sum(y[index]) / len(index[0])

fig, ax = plt.subplots()
plt.plot(x, y, '.')
vor = Voronoi(np.stack((mux, muy), axis=1))
voronoi_plot_2d(vor, ax=ax, show_points=True, show_vertices=False, point_size=20)
plt.title('after k-means iterations (PCA space)')
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

cluster_ids = mins
np.savetxt("conv2_feature_pca_kmeans5.csv", cluster_ids, fmt="%d")
print("cluster counts:", np.bincount(cluster_ids))
print("first few cluster assignments:", cluster_ids[:20])
X shape: (1509, 1014)
[figure: conv2 feature space, first two raw feature dimensions]
explained variance: [191.71822  154.36792  102.93614   75.39953   50.625072  48.00827
  35.49985   28.029173  21.681234  21.270788]
[figure: conv2 features projected onto the first two principal components]
[figure: Voronoi partition of (PC1, PC2) before k-means iterations]
[figure: Voronoi partition of (PC1, PC2) after k-means iterations]
cluster counts: [365  66 354 359 365]
first few cluster assignments: [4 3 4 2 0 4 0 0 4 4 0 2 3 3 2 1 0 0 0 3]
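
To get a feel for what the five clusters capture, one option is to display a few commit images from each. A minimal sketch, assuming trainset and cluster_ids from the cells above (the feature-extraction DataLoader used shuffle=False, so cluster_ids[i] corresponds to trainset.files[i]):

In [ ]:
# Sketch: show a few example commit images from each cluster.
# Assumes `trainset` and `cluster_ids` from the cells above; the feature-extraction
# DataLoader used shuffle=False, so cluster_ids[i] maps to trainset.files[i].
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

nclusters = 5
nexamples = 4
fig, axes = plt.subplots(nclusters, nexamples, figsize=(2 * nexamples, 2 * nclusters))
for ax in axes.flat:
    ax.axis("off")
for k in range(nclusters):
    members = np.where(cluster_ids == k)[0][:nexamples]
    for j, idx in enumerate(members):
        axes[k, j].imshow(Image.open(trainset.files[idx]), cmap="gray")
        if j == 0:
            axes[k, j].set_title(f"cluster {k}", loc="left")
plt.show()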