Research > Machine Learning¶
Deconstructing Neil's Machine Learning Code¶
Oh man, the last Data Science with Neil was a tough one. Not only was it highly technical with lots of new, mysterious jargon...but I was particularly sleepy at 11:00PM when the class took place. My murky brain had only 3 takeaways:
- Machine Learning
- JAX
- Character Recognition
While I intend to watch the recording of the lecture one more time, I thought it would be worthwhile to spend some time deconstructing Neil's Machine Learning code line by line...with the help of my personal tutor, ChatGPT.
This is Neil's full code...¶
#!/usr/bin/env python3
"""
Minimal JAX MNIST trainer (JAX + matplotlib + Python stdlib only).
Downloads MNIST from https://raw.githubusercontent.com/fgnt/mnist/master
"""
import gzip
import struct
import urllib.request
import jax
import jax.numpy as jnp
from jax import random, jit, grad
import matplotlib.pyplot as plt

# --------------------- Download + parse MNIST (IDX format) ---------------------
SERVER = "https://raw.githubusercontent.com/fgnt/mnist/master"
FILES = {
    "train_images": "train-images-idx3-ubyte.gz",
    "train_labels": "train-labels-idx1-ubyte.gz",
    "test_images": "t10k-images-idx3-ubyte.gz",
    "test_labels": "t10k-labels-idx1-ubyte.gz",
}

def fetch_bytes(path):
    with urllib.request.urlopen(f"{SERVER}/{path}") as r:
        return r.read()

def parse_idx_images(gz_bytes):
    data = gzip.decompress(gz_bytes)
    magic, n, rows, cols = struct.unpack(">IIII", data[:16])
    assert magic == 2051, f"Bad magic for images: {magic}"
    # Read the remaining bytes as uint8, then normalize to [0,1] and flatten.
    arr = jnp.frombuffer(data, dtype=jnp.uint8, offset=16)
    arr = arr.reshape((n, rows * cols)).astype(jnp.float32) / 255.0
    return arr

def parse_idx_labels(gz_bytes):
    data = gzip.decompress(gz_bytes)
    magic, n = struct.unpack(">II", data[:8])
    assert magic == 2049, f"Bad magic for labels: {magic}"
    lab = jnp.frombuffer(data, dtype=jnp.uint8, offset=8).astype(jnp.int32)
    return lab

def load_mnist():
    trX = parse_idx_images(fetch_bytes(FILES["train_images"]))
    trY = parse_idx_labels(fetch_bytes(FILES["train_labels"]))
    teX = parse_idx_images(fetch_bytes(FILES["test_images"]))
    teY = parse_idx_labels(fetch_bytes(FILES["test_labels"]))
    return trX, trY, teX, teY

# ------------------------------- Tiny JAX MLP ---------------------------------
def init_params(key, d_in=784, d_hidden=128, d_out=10):
    k1, k2 = random.split(key)
    W1 = random.normal(k1, (d_in, d_hidden)) * jnp.sqrt(2.0 / d_in)
    b1 = jnp.zeros((d_hidden,))
    W2 = random.normal(k2, (d_hidden, d_out)) * jnp.sqrt(2.0 / d_hidden)
    b2 = jnp.zeros((d_out,))
    return {"W1": W1, "b1": b1, "W2": W2, "b2": b2}

def forward(params, x):
    h = jax.nn.relu(x @ params["W1"] + params["b1"])
    return h @ params["W2"] + params["b2"]  # logits

@jit
def predict(params, x):
    return jnp.argmax(forward(params, x), axis=-1)

@jit
def cross_entropy_loss(params, x, y):
    logits = forward(params, x)
    y_one = jax.nn.one_hot(y, num_classes=logits.shape[-1])
    logp = jax.nn.log_softmax(logits)
    return -jnp.mean(jnp.sum(y_one * logp, axis=-1))

@jit
def accuracy(params, x, y):
    return jnp.mean(predict(params, x) == y)

@jit
def sgd_step(params, x, y, lr):
    grads = grad(cross_entropy_loss)(params, x, y)
    return {k: params[k] - lr * grads[k] for k in params}

# ------------------------------ Training helpers ------------------------------
def train(train_X, train_y, test_X, test_y, epochs=5, batch_size=128, lr=0.1, seed=0):
    key = random.PRNGKey(seed)
    params = init_params(key)
    n = train_X.shape[0]
    for epoch in range(1, epochs + 1):
        key, sk = random.split(key)
        perm = random.permutation(sk, n)
        for i in range(0, n, batch_size):
            idx = perm[i:i + batch_size]
            params = sgd_step(params, train_X[idx], train_y[idx], lr)
        # quick epoch metrics on small subsets (for speed)
        tr_loss = cross_entropy_loss(params, train_X[:2000], train_y[:2000])
        tr_acc = accuracy(params, train_X[:2000], train_y[:2000])
        te_acc = accuracy(params, test_X, test_y)
        print(f"Epoch {epoch:2d} | loss {float(tr_loss):.4f} | "
              f"train_acc {float(tr_acc):.4f} | test_acc {float(te_acc):.4f}")
    return params

def show_samples(X, y, params=None, ncols=8):
    fig, axes = plt.subplots(1, ncols, figsize=(ncols * 1.4, 1.8))
    for i in range(ncols):
        img = X[i].reshape(28, 28)
        axes[i].imshow(img, cmap="gray", interpolation="nearest")
        title = f"label={int(y[i])}"
        if params is not None:
            pred = int(predict(params, X[i:i+1])[0])
            title += f"\npred={pred}"
        axes[i].set_title(title, fontsize=8)
        axes[i].axis("off")
    plt.tight_layout()
    plt.show()

# ------------------------------------ Main ------------------------------------
def main():
    print("Downloading + loading MNIST...")
    train_X, train_y, test_X, test_y = load_mnist()
    print("Train:", train_X.shape, train_y.shape, "| Test:", test_X.shape, test_y.shape)
    print("Showing a few training samples (ground-truth labels)...")
    show_samples(train_X, train_y, params=None)
    print("Training...")
    params = train(train_X, train_y, test_X, test_y, epochs=5, batch_size=128, lr=0.1)
    print("Showing samples with model predictions (fits)...")
    show_samples(train_X, train_y, params=params)

main()
I asked for an explanation from ChatGPT and this is how it described the above code...
"It downloads MNIST, builds a tiny neural network in JAX, trains it, and shows sample digits + predictions."
OK, some jargon to be defined:
MNIST = "a database of handwritten digits that is commonly used for training various image processing systems" wikipedia
- Downloadable digital images of handwritten digits (0 through 9), each paired with its correct label, used for Machine Learning model training
Tiny Neural Network = in this code, simply a very small Neural Network (one hidden layer of 128 neurons). My Google search instead returned Alibaba's TinyNeuralNetwork, "an efficient and easy-to-use deep learning model compression framework" (alibaba github) with features like "neural architecture search, pruning, quantization, model conversion, and etc."...which is a different, unrelated tool.
- The point here is just that a lightweight, basic Neural Network model is more accessible and easier to understand than bigger, more complicated Neural Network models
JAX = "JAX is a Python library for accelerator-oriented array computation and program transformation, designed for high-performance numerical computing and large-scale machine learning." JAX Documentation
- From my understanding, JAX is essentially a version of NumPy that can run calculations much more efficiently and quickly (on CPUs, GPUs, and TPUs), with extras like jit and grad...a quick sketch of this is shown below.
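To see that for myself, here is a minimal sketch of my own (not from Neil's code; the values are made up) showing that jax.numpy mirrors NumPy while grad and jit add automatic differentiation and compilation:
import jax.numpy as jnp
from jax import grad, jit

x = jnp.array([1.0, 2.0, 3.0])           # works just like a NumPy array
print(jnp.mean(x ** 2))                  # 4.6666665, exactly what NumPy would give

def loss(w):
    return jnp.sum((w * x - 6.0) ** 2)   # a made-up scalar "loss"

print(grad(loss)(2.0))                   # -16.0, computed by automatic differentiation
fast_loss = jit(loss)                    # compile the same function for speed
print(fast_loss(2.0))                    # 20.0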
Code Deconstruction > Imports & Descriptions¶
In ChatGPT, I prompted "explain this code" and pasted Neil's full code. The following is ChatGPT's breakdown and explanation of the code.
Neil describes the program as a "Minimal JAX MNIST trainer (JAX + matplotlib + Python stdlib only)", and the docstring notes that it downloads the MNIST dataset from https://raw.githubusercontent.com/fgnt/mnist/master.
The topmost section of code imports all the needed libraries for the program.
import gzip
import struct
import urllib.request
import jax
import jax.numpy as jnp
from jax import random, jit, grad
import matplotlib.pyplot as plt
gzip = a data compression/decompression utility. W3schools.com provides the following definition..."The gzip module provides a simple interface for compressing and decompressing data using the gzip format."
struct = from docs.python.org, this utility "interprets bytes as packed binary data" (a small gzip + struct sketch follows this list)
urllib.request = "is a package that collects several modules for working with URLs" according to docs.python.org
jax.numpy = JAX's own implementation of the NumPy API (NumPy-style functions that run on JAX arrays), needed because "While JAX tries to follow the NumPy API as closely as possible, sometimes JAX cannot follow NumPy exactly." according to JAX documentation
random = is a JAX "package (that) provides a number of routines for deterministic generation of sequences of pseudorandom numbers." according to JAX documentation
jit = "Just-in-Time" compilation for Python: "take a standard Python function operating on JAX arrays and convert it into a highly optimized, fused sequence of operations specific to your hardware (CPU, GPU, or TPU)" according to APXML. Speeds up code runtime.
grad = "is a key feature of JAX that provides automatic differentiation, allowing you to compute gradients of functions efficiently." according to Medium.com
matplotlib.pyplot = is used for making visualizations such as data plots...and in this case images of digits.
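Since gzip and struct were new to me, here is a tiny sketch of my own (toy bytes, not the real MNIST files) showing how the two work together: pack some big-endian integers into raw bytes, gzip them, then decompress and unpack them again...
import gzip
import struct

# pack two unsigned 32-bit integers, big-endian (">II"), into 8 raw bytes
header = struct.pack(">II", 2049, 60000)
compressed = gzip.compress(header)             # roughly what a .gz file holds

data = gzip.decompress(compressed)             # back to the raw 8 bytes
magic, n = struct.unpack(">II", data[:8])      # and back to the two integers
print(magic, n)                                # 2049 60000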
The next section of code sets up the download of the MNIST image and label files:
The location of the files...the server URL is assigned to the variable SERVER, and the four file names (train/test images and labels) are stored in the FILES dictionary.
SERVER = "https://raw.githubusercontent.com/fgnt/mnist/master"
FILES = {
"train_images": "train-images-idx3-ubyte.gz",
"train_labels": "train-labels-idx1-ubyte.gz",
"test_images": "t10k-images-idx3-ubyte.gz",
"test_labels": "t10k-labels-idx1-ubyte.gz",
}
Downloads the raw bytes...
def fetch_bytes(path):
    with urllib.request.urlopen(f"{SERVER}/{path}") as r:
        return r.read()
ChatGPT says the next section Parses the image files (IDX format), which it further explains to mean..."reading and interpreting a special binary file format called IDX (not PNG or JPEG) used by MNIST to store its images and labels...and converts it into usable image arrays"
IDX contains:
a Header, "magic number", number of items, rows, columns
RAW Bytes of pixel data or label data...uncompressed, no metadata or image encoding
Parsing means:
read the file bytes
check the header (how many images it contains, plus rows and columns)
extract pixel data
reshape pixels into 28x28 images
convert them into model-usable arrays
def parse_idx_images(gz_bytes):
    data = gzip.decompress(gz_bytes)  # unzip
    magic, n, rows, cols = struct.unpack(">IIII", data[:16])  # reads the first 16 bytes as 4 unsigned integers
    assert magic == 2051, f"Bad magic for images: {magic}"  # check that it is an image file
    arr = jnp.frombuffer(data, dtype=jnp.uint8, offset=16)  # interprets the remaining bytes as unsigned 8-bit integers (pixel values 0 to 255)
    arr = arr.reshape((n, rows * cols)).astype(jnp.float32) / 255.0  # each image becomes a 1D vector of length 28x28=784...converted to float32 and divided by 255 so every pixel lands between 0 and 1 (normalizing)
    return arr
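To convince myself the header logic works, here is a little sketch of my own: build a fake gzipped IDX image file (two all-black 28x28 images) and run it through parse_idx_images, assuming the function above is already defined in the session...
import gzip
import struct

# fake IDX image file: 16-byte header (magic, count, rows, cols) + zero pixel bytes
fake = struct.pack(">IIII", 2051, 2, 28, 28) + bytes(2 * 28 * 28)
fake_gz = gzip.compress(fake)

arr = parse_idx_images(fake_gz)
print(arr.shape)         # (2, 784) -> two flattened 28x28 images
print(float(arr.max()))  # 0.0, because every fake pixel was zero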
The next section Parses the Label File
Similar to the previous procedure for images, this time for just the "Labels".
def parse_idx_labels(gz_bytes):
    data = gzip.decompress(gz_bytes)  # decompresses
    magic, n = struct.unpack(">II", data[:8])  # reads the 8-byte header
    assert magic == 2049, f"Bad magic for labels: {magic}"  # check that it is a label file
    lab = jnp.frombuffer(data, dtype=jnp.uint8, offset=8).astype(jnp.int32)  # interprets remaining bytes as integers from 0 to 9 (digit labels)
    return lab
The next section Loads all MNIST Splits. What I understand this to be is that each of the four MNIST files is downloaded and parsed, and the result is assigned to its own variable.
The 4 sets include:
- training images > trX
- training labels > trY
- testing images > teX
- testing labels > teY
def load_mnist():
    trX = parse_idx_images(fetch_bytes(FILES["train_images"]))
    trY = parse_idx_labels(fetch_bytes(FILES["train_labels"]))
    teX = parse_idx_images(fetch_bytes(FILES["test_images"]))
    teY = parse_idx_labels(fetch_bytes(FILES["test_labels"]))
    return trX, trY, teX, teY
With all the libraries imported and image data and labels downloaded, now the code moves into the Neural Network and Machine Learning Model building procedure.
Defining the Tiny Neural Network (MLP).
MLP = not "My Little Pony" (what the Google search returned) but rather a Multilayer Perceptron (according to ChatGPT)..."the simplest type of Neural Network, made of fully connected (dense) layers"
According to Wikipedia, an MLP is a "feedforward Neural Network" (an artificial neural network in which information flows in a single direction) consisting of fully connected neurons with nonlinear "activation functions" (a node function that calculates the output of the node based on its individual inputs and their weights), organized in layers, and notable for being able to distinguish data that is not linearly separable.
Neural Networks are trained using "backpropagation" (a method for computing the gradient of the loss function with respect to the network's weights), where gradient computation is done one layer at a time, iterating backward from the last layer to avoid redundant calculations.
According to geeksforgeeks.com...an MLP "is called multi-layer because it contains an input layer, one or more hidden layers and an output layer. The purpose of an MLP is to model complex relationships between inputs and outputs."
MLPs are Simple
- only matrix multiplication + activation functions
- general-purpose > can learn many types of functions (classification, regression)
- is "dense" > every input influences every Neuron
The Key Components of an MLP are:
Input Layer... the first layer responsible for receiving raw input values
Hidden Layer...can be one or more intermediate layers between input and output, performing most of the computation required applying weights and biases to the input data, followed by an activation function to introduce non-linearity. geeksforgeeks.org
Output Layer..."the final layer, produces the output predictions. The number of neurons in the layer corresponds to the number of classification categories to be predicted. The "Activation Function" used in the output layer depends on the type of problem: Softmax for multi-class classification, Sigmoid for Binary Classification, or linear for Regression".
from geeksforgeeks.org
MLP is good for MNIST
- MNIST images are small (28x28)
- A simple MLP can reach 97% accuracy with minimal code
- It's fast to train. More advanced models like CNNs (Convolutional Neural Networks) perform better, but an MLP is the easiest starting point. (ChatGPT)
Initialize Neural Network Parameters
- W = weights, how strongly each input influences each subsequent neuron
- b = biases, a vector added to each layer's output before activation; it lets neurons 'shift' their activation threshold (a neuron can still fire even when its weighted inputs sum to zero)
def init_params(key, d_in=784, d_hidden=128, d_out=10):  # 784 input pixels, 128 hidden neurons, 10 output classes
    k1, k2 = random.split(key)  # split the randomly generated PRNG key into 2 independent keys
    W1 = random.normal(k1, (d_in, d_hidden)) * jnp.sqrt(2.0 / d_in)  # weight matrix input > hidden...random normal values scaled by sqrt(2/fan_in)
    b1 = jnp.zeros((d_hidden,))  # bias for each hidden neuron...init = 0
    W2 = random.normal(k2, (d_hidden, d_out)) * jnp.sqrt(2.0 / d_hidden)  # weight matrix hidden > output...random normal values scaled by sqrt(2/fan_in)
    b2 = jnp.zeros((d_out,))  # bias for each output class...init = 0
    return {"W1": W1, "b1": b1, "W2": W2, "b2": b2}
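As a sanity check of my own (not part of Neil's script), the parameter shapes can be printed with jax.tree_util.tree_map, assuming init_params above is already defined:
import jax
from jax import random

params = init_params(random.PRNGKey(0))
print(jax.tree_util.tree_map(lambda p: p.shape, params))
# {'W1': (784, 128), 'W2': (128, 10), 'b1': (128,), 'b2': (10,)}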
Forward Pass
According to ChatGPT...
- x is a batch of INPUTs
- The first layer applies ReLU
- The second layer generates Logits
ReLU = Rectified Linear Unit
- A common Activation Function used in Neural Networks, including MLPs
- The Formula: ReLU(x) = max(0, x)
- If INPUT is Positive, keep it
- if INPUT is Negative, output 0
- ReLU helps Neural Networks learn faster...avoids the 'slow saturating' behavior of older activations (sigmoid, tanh)
- Other activations (sigmoid or tanh) squash values, which can kill gradients
- ReLU only compares to 0 > no exponentials, no trigonometry
- ReLU 'rectifies' data (negative values are replaced with 0); a quick demo is below
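A one-liner sketch of my own to see ReLU in action on a few made-up values:
import jax
import jax.numpy as jnp

print(jax.nn.relu(jnp.array([-2.0, -0.5, 0.0, 1.5, 3.0])))
# [0.  0.  0.  1.5 3. ] -> negatives become 0, everything else passes through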
Logit = raw output score of a Neural Network, before it is turned into a probability (ChatGPT)
- The number produced by the Neural Network before Softmax
- A real number (positive, negative, or zero) representing how strongly the model believes an input belongs to a class
- While not probabilities, the class with the largest logit is usually the prediction
ex: For MLP outputs for digits 0-9, if the logits are as follows... [-2.1, 0.3, 1.7, 5.2, -0.3, 0.0, 0.4, -3.0, 2.8, 1.2]
...5.2 is the highest value; it sits at index position 3 (counting from 0), so the predicted digit is '3'.
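That is exactly what jnp.argmax does inside Neil's predict function; a quick sketch of my own using those same made-up logits:
import jax.numpy as jnp

logits = jnp.array([-2.1, 0.3, 1.7, 5.2, -0.3, 0.0, 0.4, -3.0, 2.8, 1.2])
print(int(jnp.argmax(logits, axis=-1)))  # 3 -> the predicted digit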
Softmax
- Softmax turns logits into probabilities
- The Formula: probability_i = exp(logit_i) / sum over all classes of exp(logit_j)...exponentiate each logit, then divide by the sum of the exponentiated logits, so the probabilities add up to 1
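A small sketch of my own applying jax.nn.softmax to the made-up logits from above; the probabilities sum to 1 and the biggest one still belongs to index 3:
import jax
import jax.numpy as jnp

logits = jnp.array([-2.1, 0.3, 1.7, 5.2, -0.3, 0.0, 0.4, -3.0, 2.8, 1.2])
probs = jax.nn.softmax(logits)
print(float(probs.sum()))      # 1.0
print(int(jnp.argmax(probs)))  # 3, same winner as the raw logits
print(float(probs[3]))         # roughly 0.86 -> the model's confidence in '3'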
def forward(params, x):
    h = jax.nn.relu(x @ params["W1"] + params["b1"])  # apply ReLU at the first layer, x has input size (784)
    return h @ params["W2"] + params["b2"]  # gives logits (raw scores, not probabilities), output size = number of output categories
Predictions
ChatGPT explains...
- jit tells JAX to compile the function for speed
- runs the 'forward' function then picks the class with the largest logit for each sample
@jit  # compile for speed
def predict(params, x):
    return jnp.argmax(forward(params, x), axis=-1)  # runs forward & picks the class with the largest logit
Loss Function (Cross Entropy)
- run the 'forward' function to get logit values, assign them to the variable 'logits'
- 'one hot' encode the true labels > a vector of zeros with a 1 at the index of the correct class, assigned to the variable 'y_one'
- compute log-probabilities for each class with log_softmax, assigned to the variable 'logp'
- multiplying y_one by logp picks out the log-probability of the correct class for each sample; the function returns the negative mean of those values (lower is better)
@jit
def cross_entropy_loss(params, x, y):
    logits = forward(params, x)
    y_one = jax.nn.one_hot(y, num_classes=logits.shape[-1])
    logp = jax.nn.log_softmax(logits)
    return -jnp.mean(jnp.sum(y_one * logp, axis=-1))
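Here is a tiny worked example of my own, with one made-up sample whose true label is 3, showing the same one_hot / log_softmax / negative-mean recipe applied to raw logits (no network needed):
import jax
import jax.numpy as jnp

logits = jnp.array([[-2.1, 0.3, 1.7, 5.2, -0.3, 0.0, 0.4, -3.0, 2.8, 1.2]])  # batch of 1
y = jnp.array([3])                         # the correct digit

y_one = jax.nn.one_hot(y, num_classes=10)  # [0,0,0,1,0,0,0,0,0,0]
logp = jax.nn.log_softmax(logits)          # log-probabilities per class
loss = -jnp.mean(jnp.sum(y_one * logp, axis=-1))
print(float(loss))  # about 0.15 -> small, because the logits already favor '3'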
Accuracy
ChatGPT explains...
- counts how many predictions are correct, then returns the fraction that are correct
- gets the model-generated prediction for each digit and compares it to the true label
- jnp.mean on a boolean array gives the fraction of True values (a prediction is True when it matches the correct digit)
The Accuracy Function checks every prediction...correct ones count as 1, wrong ones count as 0, averages them, and that average is the accuracy score
@jit
def accuracy(params, x, y):
    return jnp.mean(predict(params, x) == y)
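The "mean of a boolean array" trick was new to me, so here's a short sketch of my own with made-up predictions and labels:
import jax.numpy as jnp

preds = jnp.array([3, 1, 7, 7])
labels = jnp.array([3, 1, 2, 7])
print(float(jnp.mean(preds == labels)))  # 0.75 -> 3 of the 4 predictions match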
One SGD Update Step
ChatGPT explains...
This function does one training step of Stochastic Gradient Descent (SGD)
- Compute how much the loss changes with respect to each parameter (the gradient)
- Move each parameter a tiny bit in the direction that reduces the loss
grad(cross_entropy_loss)
- a function that computes the gradient of the loss with respect to the parameters...the derivative (slope) of the loss with respect to each entry in W1, b1, W2, b2
Gradient Descent:
- Take the current parameter (params[k])
- Take its gradient (grads[k])
- Multiply the gradient by the Learning Rate (lr)
- controls the step size
- Subtract it from the current parameter
- to minimize the loss
- Store the results in a new dictionary of updated parameters, which the function returns
...move each parameter a little bit opposite the gradient to reduce the loss.
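To make the "subtract the gradient" idea concrete, here is a toy sketch of my own (a single made-up parameter w and a made-up loss, not Neil's network) doing exactly one gradient-descent step:
import jax

def toy_loss(w):
    return (w - 3.0) ** 2            # minimized at w = 3

w = 5.0
lr = 0.1
g = jax.grad(toy_loss)(w)            # slope at w=5 is 4.0
w = w - lr * g                       # step opposite the slope
print(float(w), float(toy_loss(w)))  # 4.6 2.56 -> closer to 3, and the loss dropped from 4.0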
SGD
Gradient Descent is a method to train a Neural Network by adjusting its parameters (weights and biases) so the loss goes down. Starting from a point on the loss surface and knowing the slope, nudge the parameters a little way down the slope...and repeat, to reach the lowest point (minimum loss).
A normal gradient descent (Batch Gradient Descent), which computes the loss using all the data at once, is slow and uses too much memory.
Stochastic Gradient Descent (SGD) uses a small random subset to estimate the slope.
- Pick a random batch of data (Stochastic = random)
- Compute the Loss for that batch
- Compute the Gradient, tells you which direction increases loss the most
- Update parameters the opposite way (reduces the loss)
- Grab another random batch > repeat (and the model learns)
@jit
def sgd_step(params, x, y, lr):
    grads = grad(cross_entropy_loss)(params, x, y)
    return {k: params[k] - lr * grads[k] for k in params}  # adjust each parameter in the direction that reduces the loss
Training Loop¶
ChatGPT explains...
- Create a random key from the seed
- Initializes parameters
- n = the number of training examples
def train(train_X, train_y, test_X, test_y, epochs=5, batch_size=128, lr=0.1, seed=0):
    key = random.PRNGKey(seed)
    params = init_params(key)
    n = train_X.shape[0]
Epoch Loop
- For each epoch, split the key again
- Generate a random permutation of indices (shuffles the dataset)
for epoch in range(1, epochs + 1):
    key, sk = random.split(key)
    perm = random.permutation(sk, n)
Mini-Batch Loop
- walks through the training data in chunks
- 'idx' selects a slice of shuffled indices
- takes the mini-batch and performs an SGD Step to update the parameters
for i in range(0, n, batch_size):
    idx = perm[i:i + batch_size]
    params = sgd_step(params, train_X[idx], train_y[idx], lr)
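To see how the shuffled indices turn into mini-batches, here is a small sketch of my own with a made-up dataset of 10 examples and a batch size of 4:
from jax import random

key = random.PRNGKey(0)
n, batch_size = 10, 4
perm = random.permutation(key, n)    # a shuffled version of the indices 0..9
for i in range(0, n, batch_size):
    print(perm[i:i + batch_size])    # batches of 4, 4, and 2 shuffled indices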
Compute Metrics for Each Epoch
- Evaluates the loss and accuracy on a 2,000-sample slice of the training data (for speed) and the accuracy on the full test set
- Returns the trained parameters at the end
# quick epoch metrics on small subsets (for speed)
tr_loss = cross_entropy_loss(params, train_X[:2000], train_y[:2000])
tr_acc = accuracy(params, train_X[:2000], train_y[:2000])
te_acc = accuracy(params, test_X, test_y)
print(f"Epoch {epoch:2d} | loss {float(tr_loss):.4f} | "
f"train_acc {float(tr_acc):.4f} | test_acc {float(te_acc):.4f}")
return params
Visualizing Samples¶
def show_samples(X, y, params=None, ncols=8):
    fig, axes = plt.subplots(1, ncols, figsize=(ncols * 1.4, 1.8))
    for i in range(ncols):
        img = X[i].reshape(28, 28)
        axes[i].imshow(img, cmap="gray", interpolation="nearest")
        title = f"label={int(y[i])}"
        if params is not None:
            pred = int(predict(params, X[i:i+1])[0])
            title += f"\npred={pred}"
        axes[i].set_title(title, fontsize=8)
        axes[i].axis("off")
    plt.tight_layout()
    plt.show()
- shows a row of sample images (each 28x28 pixels), titled with the ground-truth label and, when trained parameters are passed in, the model's prediction
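The script only visualizes training images; as a small experiment of my own (assuming test_X, test_y, and the trained params are available, e.g. when running the pieces interactively), the same helper can show unseen test digits, or the figure can be saved to a file when there is no display:
import matplotlib.pyplot as plt

show_samples(test_X, test_y, params=params)  # same helper, but on unseen test digits
# when running without a display, one could swap plt.show() inside show_samples for:
# plt.savefig("mnist_samples.png", dpi=150)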