In [ ]:
Class¶
Assignment ¶
- Analyze your data set
- prepare a notebook with the analysis of your data set, store it in your repo, and call it presentation.ipynb
- include a 1920x1080 summary slide describing you, your data, and your analysis, store it your repo's images folder, and call it presentation.png
Approach¶
1. Principal Components Analysis (PCA) with FAb Academy Git commit data ¶
Based on the PCA example from class, I applied PCA to the weekly Git commit data of Fab Academy students. Labels were not required for PCA itself, but I used the students’ year as Y for visualization purposes.
In [1]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import pandas as pd
np.set_printoptions(precision=1)
df = pd.read_csv("datasets/fabacademy_commit_weekly_summary.csv")
week_cols = [f"week_{i:02d}" for i in range(1, 31)] # week_01 … week_30
X = df[week_cols].values
y = df["year"].values
print(f"Fab Academy weekly commit data shape (records, weeks): {X.shape}")
#
# plot week1 and week10
#
plt.scatter(X[:,0], X[:,9], c=y)
plt.xlabel("week_01 commits")
plt.ylabel("week_10 commits")
plt.title("Fab Academy: week_01 vs week_10")
plt.colorbar(label="year")
plt.show()
#
# standardize (zero mean, unit variance) to eliminate dependence on data scaling
#
scaler = sklearn.preprocessing.StandardScaler()
Xscale = scaler.fit_transform(X)
#
# do 10 component PCA
#
pca = sklearn.decomposition.PCA(n_components=10)
pca.fit(Xscale)
print(f"explained variance: {pca.explained_variance_}")
Xpca = pca.transform(Xscale)
#
# plot vs first two PCA components
#
plt.scatter(Xpca[:,0], Xpca[:,1], c=y, s=8)
plt.xlabel("principal component 1")
plt.ylabel("principal component 2")
plt.title("Fab Academy: weekly commits vs PCA components")
plt.colorbar(label="year")
plt.show()
Fab Academy weekly commit data shape (records, weeks): (1509, 30)
explained variance: [8.5 2.8 1.9 1.7 1.2 1. 1. 1. 0.9 0.8]
Using the PCA components, I performed k-means clustering as in Week 3_2.
In [10]:
#
# use PCA components for K-means (PC1, PC2)
#
x = Xpca[:,0]
y = Xpca[:,1]
#
# k-means parameters
#
nclusters = 5
nsteps = 10
np.random.seed(0)
#
# choose starting points
#
indices = np.random.uniform(low=0, high=len(x), size=nclusters).astype(int)
mux = x[indices]
muy = y[indices]
#
# plot before iteration
#
fig, ax = plt.subplots()
plt.plot(x, y, '.')
vor = Voronoi(np.stack((mux, muy), axis=1))
voronoi_plot_2d(vor, ax=ax, show_points=True, show_vertices=False, point_size=20)
plt.title('before k-means iterations (PCA space)')
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()
#
# do k-means iteration
#
for i in range(nsteps):
xm = np.outer(x, np.ones(len(mux)))
ym = np.outer(y, np.ones(len(muy)))
muxm = np.outer(np.ones(len(x)), mux)
muym = np.outer(np.ones(len(x)), muy)
distances = np.sqrt((xm - muxm)**2 + (ym - muym)**2)
mins = np.argmin(distances, axis=1)
for k in range(len(mux)):
index = np.where(mins == k)
if len(index[0]) > 0:
mux[k] = np.sum(x[index]) / len(index[0])
muy[k] = np.sum(y[index]) / len(index[0])
#
# plot after
#
fig, ax = plt.subplots()
plt.plot(x, y, '.')
vor = Voronoi(np.stack((mux, muy), axis=1))
voronoi_plot_2d(vor, ax=ax, show_points=True, show_vertices=False, point_size=20)
plt.title('after k-means iterations (PCA space)')
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()
#
# assign cluster IDs
#
cluster_ids = mins
df["pca_kmeans_cluster"] = cluster_ids
df.to_csv("datasets/fabacademy_pca_kmeans5.csv", index=False)
print("cluster counts:", np.bincount(cluster_ids))
print("first few cluster assignments:", cluster_ids[:20])
cluster counts: [ 110 33 70 1210 580] first few cluster assignments: [3 3 3 0 4 3 3 3 3 3 3 3 0 4 0 1 0 0 4 3]
2. CNN > PCA > k-means with FAb Academy Git commit image data ¶
Using the features extracted by the CNN in Week 2-2 from the Fab Academy Git commit data, I applied a CNN → PCA → k-means pipeline to perform clustering. I asked ChatGPT how to apply PCA to the features I extracted in Week 2, and added that step to my workflow.
In [2]:
!pip install torch torchvision
Requirement already satisfied: torch in /opt/conda/lib/python3.13/site-packages (2.9.1) Requirement already satisfied: torchvision in /opt/conda/lib/python3.13/site-packages (0.24.1) Requirement already satisfied: filelock in /opt/conda/lib/python3.13/site-packages (from torch) (3.20.0) Requirement already satisfied: typing-extensions>=4.10.0 in /opt/conda/lib/python3.13/site-packages (from torch) (4.15.0) Requirement already satisfied: setuptools in /opt/conda/lib/python3.13/site-packages (from torch) (80.9.0) Requirement already satisfied: sympy>=1.13.3 in /opt/conda/lib/python3.13/site-packages (from torch) (1.14.0) Requirement already satisfied: networkx>=2.5.1 in /opt/conda/lib/python3.13/site-packages (from torch) (3.5) Requirement already satisfied: jinja2 in /opt/conda/lib/python3.13/site-packages (from torch) (3.1.6) Requirement already satisfied: fsspec>=0.8.5 in /opt/conda/lib/python3.13/site-packages (from torch) (2025.9.0) Requirement already satisfied: numpy in /opt/conda/lib/python3.13/site-packages (from torchvision) (2.3.3) Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /opt/conda/lib/python3.13/site-packages (from torchvision) (11.3.0) Requirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/conda/lib/python3.13/site-packages (from sympy>=1.13.3->torch) (1.3.0) Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.13/site-packages (from jinja2->torch) (3.0.3)
In [35]:
import torch
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from pathlib import Path
transform = transforms.Compose([
transforms.Grayscale(), # 上から実行
#transforms.Resize((32,32)), # MNISTにあわせたけど、もともと32でつくりなおそうかな
transforms.ToTensor(),
transforms.Normalize((0.5,), (0.5,))
])
class FabAcademyCommitDataset(Dataset):
def __init__(self, root_dir, transform=None):
self.root_dir = root_dir
self.files = list(Path(root_dir).glob("*.jpg"))
self.transform = transform
def __len__(self):
return len(self.files)
def __getitem__(self, idx):
img = Image.open(self.files[idx])
if self.transform:
img = self.transform(img)
return img, 0
trainset = FabAcademyCommitDataset(
root_dir="datasets/commitsImage2",
transform=transform
)
batch_size = 4
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2) #オリジナルのでよさそう
#trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=0)
inputs, labels = next(iter(trainloader))
print("BATCH SHAPE:", inputs.shape) #1508/4 bactch=4
testset = trainset # あとでテストをつくる > 来週のために少しとっておくか
testloader = trainloader
# --- Define a Convolutional Neural Network ---
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = nn.Conv2d(1, 3, 5) #
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(3, 6, 5)
self.fc1 = nn.Linear(6 * 13 * 13, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 10)
def forward(self, x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = torch.flatten(x, 1)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
net = Net()
print(net)
# --- feature map hook ---
feature_maps = {}
def register_activation_hook(name):
def hook(module, input, output):
feature_maps[name] = output.detach()
return hook
net.conv1.register_forward_hook(register_activation_hook("conv1"))
net.conv2.register_forward_hook(register_activation_hook("conv2"))
# --- Define a Loss function and optimizer ---
import torch.optim as optim
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
# --- Train the network ---
print("LEN trainloader:", len(trainloader))
for epoch in range(2): # loop over the dataset multiple times
#for epoch in range(1): # loop over the dataset multiple times
running_loss = 0.0
for i, data in enumerate(trainloader, 0):
#print("LOOP = ", i)
# get the inputs; data is a list of [inputs, labels]
inputs, labels = data
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = net(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()
if i % 10 == 0:
print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / (i+1):.3f}")
running_loss = 0.0
print("Finished Training")
# --- save model ---
torch.save(net.state_dict(), "fabacademy_cnn.pth")
# --- Visualize feature map ---
import matplotlib.pyplot as plt
def visualize_feature_map(name):
fmap = feature_maps[name]
fmap = fmap.squeeze(0).cpu().numpy() # [C, H, W]
num_channels = fmap.shape[0]
cols = 8
rows = (num_channels + cols - 1) // cols
fig = plt.figure(figsize=(16, 2*rows))
for i in range(num_channels):
ax = fig.add_subplot(rows, cols, i+1)
ax.imshow(fmap[i], cmap='gray')
ax.axis("off")
plt.suptitle(f"{name} feature maps")
plt.show()
plt.close(fig)
visualize_feature_map("conv1")
visualize_feature_map("conv2")
BATCH SHAPE: torch.Size([4, 1, 64, 64]) Net( (conv1): Conv2d(1, 3, kernel_size=(5, 5), stride=(1, 1)) (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) (conv2): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1)) (fc1): Linear(in_features=1014, out_features=120, bias=True) (fc2): Linear(in_features=120, out_features=84, bias=True) (fc3): Linear(in_features=84, out_features=10, bias=True) ) LEN trainloader: 378 [1, 1] loss: 2.345 [1, 11] loss: 2.095 [1, 21] loss: 1.024 [1, 31] loss: 0.603 [1, 41] loss: 0.225 [1, 51] loss: 0.004 [1, 61] loss: 0.000 [1, 71] loss: 0.000 [1, 81] loss: 0.000 [1, 91] loss: 0.000 [1, 101] loss: 0.000 [1, 111] loss: 0.000 [1, 121] loss: 0.000 [1, 131] loss: 0.000 [1, 141] loss: 0.000 [1, 151] loss: 0.000 [1, 161] loss: 0.000 [1, 171] loss: 0.000 [1, 181] loss: 0.000 [1, 191] loss: 0.000 [1, 201] loss: 0.000 [1, 211] loss: 0.000 [1, 221] loss: 0.000 [1, 231] loss: 0.000 [1, 241] loss: 0.000 [1, 251] loss: 0.000 [1, 261] loss: 0.000 [1, 271] loss: 0.000 [1, 281] loss: 0.000 [1, 291] loss: 0.000 [1, 301] loss: 0.000 [1, 311] loss: 0.000 [1, 321] loss: 0.000 [1, 331] loss: 0.000 [1, 341] loss: 0.000 [1, 351] loss: 0.000 [1, 361] loss: 0.000 [1, 371] loss: 0.000 [2, 1] loss: 0.000 [2, 11] loss: 0.000 [2, 21] loss: 0.000 [2, 31] loss: 0.000 [2, 41] loss: 0.000 [2, 51] loss: 0.000 [2, 61] loss: 0.000 [2, 71] loss: 0.000 [2, 81] loss: 0.000 [2, 91] loss: 0.000 [2, 101] loss: 0.000 [2, 111] loss: 0.000 [2, 121] loss: 0.000 [2, 131] loss: 0.000 [2, 141] loss: 0.000 [2, 151] loss: 0.000 [2, 161] loss: 0.000 [2, 171] loss: 0.000 [2, 181] loss: 0.000 [2, 191] loss: 0.000 [2, 201] loss: 0.000 [2, 211] loss: 0.000 [2, 221] loss: 0.000 [2, 231] loss: 0.000 [2, 241] loss: 0.000 [2, 251] loss: 0.000 [2, 261] loss: 0.000 [2, 271] loss: 0.000 [2, 281] loss: 0.000 [2, 291] loss: 0.000 [2, 301] loss: 0.000 [2, 311] loss: 0.000 [2, 321] loss: 0.000 [2, 331] loss: 0.000 [2, 341] loss: 0.000 [2, 351] loss: 0.000 [2, 361] loss: 0.000 [2, 371] loss: 0.000 Finished Training
In [36]:
# --- PCA ---
loaded_net = Net()
loaded_net.load_state_dict(torch.load("fabacademy_cnn.pth"))
loaded_net.eval()
feature_maps_pca = {}
def register_pca_hook(name):
def hook(module, input, output):
pooled = loaded_net.pool(output) # ★ conv2 output を pool する
feature_maps_pca[name] = pooled.detach()
return hook
loaded_net.conv2.register_forward_hook(register_pca_hook("conv2"))
trainloader = DataLoader(trainset, batch_size=1, shuffle=False)
features = []
for inputs, _ in trainloader:
_ = loaded_net(inputs)
fmap = feature_maps_pca["conv2"]
fmap = fmap.squeeze(0).cpu().numpy()
flat = fmap.reshape(-1)
features.append(flat)
import numpy as np
X = np.array(features)
y = np.arange(len(X))
print("X shape:", X.shape)
plt.scatter(X[:,0], X[:,1], c=y)
plt.xlabel("feature_0")
plt.ylabel("feature_1")
plt.title("conv2 feature space (two dims)")
plt.colorbar()
plt.show()
import sklearn
# standardize (zero mean, unit variance) to eliminate dependence on data scaling
scaler = sklearn.preprocessing.StandardScaler()
Xscale = scaler.fit_transform(X)
# do 10 component PCA
pca = sklearn.decomposition.PCA(n_components=10)
pca.fit(Xscale)
print("explained variance:", pca.explained_variance_)
Xpca = pca.transform(Xscale)
plt.scatter(Xpca[:,0], Xpca[:,1], c=y, s=8)
plt.xlabel("principal component 1")
plt.ylabel("principal component 2")
plt.title("Feature PCA")
plt.colorbar()
plt.show()
x = Xpca[:,0]
y = Xpca[:,1]
# k-means parameters
nclusters = 5
nsteps = 10
np.random.seed(0)
indices = np.random.uniform(low=0, high=len(x), size=nclusters).astype(int)
mux = x[indices]
muy = y[indices]
fig, ax = plt.subplots()
plt.plot(x, y, '.')
from scipy.spatial import Voronoi, voronoi_plot_2d
vor = Voronoi(np.stack((mux, muy), axis=1))
voronoi_plot_2d(vor, ax=ax, show_points=True, show_vertices=False, point_size=20)
plt.title('before k-means iterations (PCA space)')
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()
for i in range(nsteps):
xm = np.outer(x, np.ones(len(mux)))
ym = np.outer(y, np.ones(len(muy)))
muxm = np.outer(np.ones(len(x)), mux)
muym = np.outer(np.ones(len(x)), muy)
distances = np.sqrt((xm - muxm)**2 + (ym - muym)**2)
mins = np.argmin(distances, axis=1)
for k in range(len(mux)):
index = np.where(mins == k)
if len(index[0]) > 0:
mux[k] = np.sum(x[index]) / len(index[0])
muy[k] = np.sum(y[index]) / len(index[0])
fig, ax = plt.subplots()
plt.plot(x, y, '.')
vor = Voronoi(np.stack((mux, muy), axis=1))
voronoi_plot_2d(vor, ax=ax, show_points=True, show_vertices=False, point_size=20)
plt.title('after k-means iterations (PCA space)')
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()
cluster_ids = mins
np.savetxt("conv2_feature_pca_kmeans5.csv", cluster_ids, fmt="%d")
print("cluster counts:", np.bincount(cluster_ids))
print("first few cluster assignments:", cluster_ids[:20])
X shape: (1509, 1014)
explained variance: [191.71822 154.36792 102.93614 75.39953 50.625072 48.00827 35.49985 28.029173 21.681234 21.270788]
cluster counts: [365 66 354 359 365] first few cluster assignments: [4 3 4 2 0 4 0 0 4 4 0 2 3 3 2 1 0 0 0 3]
In [ ]: