← Back to problem

SOLUTION — read only after attempting. This page contains a full reference CNN pipeline, illustrative numbers, and answers to the 3 theory questions. If you have not yet sat the 180-minute mock, close this page and start there.

Mock B · Reference solution

A complete CNN pipeline for “Hand-drawn shape classification”, plus answers to the theory questions. The illustrative numbers below come from a single seeded run on a Colab L4; your own run will differ slightly. All accuracy / loss numbers are labeled [illustrative].

Full pipeline (~120 lines PyTorch)

import os, random, numpy as np, pandas as pd, torch
from pathlib import Path
from PIL import Image
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
from torchvision import transforms
from sklearn.metrics import confusion_matrix, classification_report

# Seed the Python, NumPy and PyTorch (CPU + every CUDA device) generators
# with the same value so a run can be repeated.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Class names in label-index order, plus the inverse name -> index lookup.
CLASSES = ["circle", "square", "triangle", "star", "arrow"]
CLS2IDX = dict(zip(CLASSES, range(len(CLASSES))))

# ---------- Data loader (rubric: 15 pts) ----------
class ShapeDS(Dataset):
    """Dataset of grayscale shape images listed in a DataFrame.

    Every row of ``df`` must have an ``id`` column; the image for a row is
    read from ``root/<id zero-padded to five digits>.png`` and converted to
    single-channel ("L") mode. When ``has_label`` is true the row must also
    have a ``label`` column whose value is a key of ``CLS2IDX``.

    Args:
        df: table with one row per image.
        root: directory that holds the PNG files.
        tfm: optional callable applied to the PIL image before it is returned.
        has_label: return the class index (True) or the image id (False) as
            the second element of each item.
    """

    def __init__(self, df, root="shapes", tfm=None, has_label=True):
        # Drop whatever index the caller's slice carried so rows are 0..n-1.
        self.df = df.reset_index(drop=True)
        self.root = Path(root)
        self.tfm = tfm
        self.has_label = has_label

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        """Return ``(image, class_index)``, or ``(image, id)`` when unlabeled."""
        record = self.df.iloc[i]
        sample_id = int(record["id"])
        picture = Image.open(self.root / f"{sample_id:05d}.png").convert("L")
        if self.tfm:
            picture = self.tfm(picture)
        target = CLS2IDX[record["label"]] if self.has_label else sample_id
        return picture, target

# Labeled training table and unlabeled test table.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Validation rows are a seeded 10 % sample of the 4000 training rows; the
# 1000 test rows are NOT touched when choosing the model.
val_idx = train_df.sample(frac=0.1, random_state=SEED).index
tr_idx = train_df.index.difference(val_idx)

# Single-channel normalization constants (estimated on train).
NORM = transforms.Normalize(mean=[0.1], std=[0.25])

# Training images get a random affine jitter; scoring images are only
# converted to tensors and normalized.
train_tfm = transforms.Compose(
    [
        transforms.RandomAffine(
            degrees=15,
            translate=(0.05, 0.05),
            scale=(0.9, 1.1),
        ),
        transforms.ToTensor(),
        NORM,
    ]
)
score_tfm = transforms.Compose([transforms.ToTensor(), NORM])

ds_tr = ShapeDS(train_df.loc[tr_idx], tfm=train_tfm)
ds_val = ShapeDS(train_df.loc[val_idx], tfm=score_tfm)
ds_te = ShapeDS(test_df, tfm=score_tfm, has_label=False)

# Only the training loader shuffles; both scoring loaders share one
# configuration with a larger batch and a fixed order.
_SCORE_LOADER_KW = {"batch_size": 256, "shuffle": False, "num_workers": 2}
dl_tr = DataLoader(ds_tr, batch_size=128, shuffle=True, num_workers=2)
dl_val = DataLoader(ds_val, **_SCORE_LOADER_KW)
dl_te = DataLoader(ds_te, **_SCORE_LOADER_KW)

# ---------- Baseline CNN (rubric: 30 pts) ----------
class SmallCNN(nn.Module):
    """Small convolutional classifier for single-channel images.

    The feature extractor is three stages (1->32, 32->64, 64->128 channels).
    Each stage is two 3x3 conv + BatchNorm + ReLU layers followed by a 2x2
    max-pool. The head applies global average pooling, dropout (p=0.3) and a
    linear layer that produces ``n_classes`` logits per image.

    Args:
        n_classes: number of output logits.
    """

    def __init__(self, n_classes=5):
        super().__init__()
        widths = (1, 32, 64, 128)
        stages = [
            self._stage(c_in, c_out)
            for c_in, c_out in zip(widths, widths[1:])
        ]
        self.feat = nn.Sequential(*stages)
        self.head = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Dropout(0.3),
            nn.Linear(widths[-1], n_classes),
        )

    @staticmethod
    def _stage(ci, co):
        """Build one stage: (conv3x3 -> BN -> ReLU) twice, then max-pool."""
        layers = []
        for c_from in (ci, co):
            # Convs carry no bias because the BatchNorm that follows has one.
            layers.append(nn.Conv2d(c_from, co, 3, padding=1, bias=False))
            layers.append(nn.BatchNorm2d(co))
            layers.append(nn.ReLU(inplace=True))
        layers.append(nn.MaxPool2d(2))
        return nn.Sequential(*layers)

    def forward(self, x):
        features = self.feat(x)
        return self.head(features)

model = SmallCNN().to(DEVICE)
n_params = sum(p.numel() for p in model.parameters())
# 287,525 for the default 5-class SmallCNN (~288k) — well under the 500k budget
print("params:", n_params)

# AdamW with a cosine-annealed learning rate; T_max=15 spans the 15-epoch
# training loop, which steps the scheduler once per epoch.
opt   = torch.optim.AdamW(model.parameters(), lr=2e-3, weight_decay=1e-4)
sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=15)
loss_fn = nn.CrossEntropyLoss()

def score_on(loader):
    """Run the global ``model`` over ``loader`` and collect its predictions.

    Args:
        loader: iterable of ``(images, second)`` batches, where ``second`` is
            whatever the dataset returns next to each image.

    Returns:
        A pair of 1-D tensors ``(seconds, predictions)`` concatenated over
        all batches; ``predictions`` holds the arg-max class index per image.
    """
    model.train(False)  # inference mode
    collected, predicted = [], []
    with torch.no_grad():
        for images, second in loader:
            logits = model(images.to(DEVICE))
            predicted.append(logits.argmax(1).cpu())
            collected.append(second)
    return torch.cat(collected), torch.cat(predicted)

# ---------- Training stability (rubric: 15 pts) ----------
# Early stopping on validation accuracy with best-weights restore.
# best_val starts at -inf rather than 0.0 so the first epoch always records a
# snapshot; with 0.0, a run whose validation accuracy never rose above zero
# left best_state as None and load_state_dict(None) failed after the loop.
best_val, best_state, patience, bad = float("-inf"), None, 4, 0
for epoch in range(15):
    model.train(True)
    for x, y in dl_tr:
        x, y = x.to(DEVICE), y.to(DEVICE)
        opt.zero_grad()
        loss = loss_fn(model(x), y)
        loss.backward()
        opt.step()
    sched.step()  # the learning-rate schedule advances once per epoch
    yv, pv = score_on(dl_val)
    acc = (yv == pv).float().mean().item()
    print(f"epoch {epoch:02d}  val acc {acc:.3f}")
    if acc > best_val:
        # Clone the tensors: state_dict() hands back the live parameters,
        # which later optimizer steps would overwrite in place.
        best_val, bad = acc, 0
        best_state = {k: v.clone() for k, v in model.state_dict().items()}
    else:
        bad += 1
        if bad >= patience:
            break  # no improvement for `patience` consecutive epochs

model.load_state_dict(best_state)
# [illustrative] best val acc ≈ 0.965 after ~10 epochs

# ---------- Evaluation report (rubric: 10 pts) ----------
yv, pv = score_on(dl_val)
# Pin the label set explicitly: without `labels`, scikit-learn infers it from
# the values present in yv/pv, and classification_report raises ValueError if
# that count differs from len(target_names) (e.g. a class absent from the
# validation split and never predicted). It also fixes the confusion-matrix
# row/column order to the order of CLASSES.
label_ids = list(range(len(CLASSES)))
print(classification_report(yv, pv, labels=label_ids, target_names=CLASSES, digits=3))
print(confusion_matrix(yv, pv, labels=label_ids))

# ---------- Submission ----------
# Predict a class name for every test image and write them to predictions.csv
# as (id, label) rows in loader order.
model.train(False)
rows = []
with torch.no_grad():
    for images, batch_ids in dl_te:
        logits = model(images.to(DEVICE))
        class_idx = logits.argmax(1).cpu().numpy()
        rows.extend(
            (int(sample_id), CLASSES[k])
            for sample_id, k in zip(batch_ids.numpy(), class_idx)
        )
submission = pd.DataFrame(rows, columns=["id", "label"])
submission.to_csv("predictions.csv", index=False)

Illustrative numbers from a single L4 run: best val accuracy ~0.965 around epoch 10, test macro accuracy ~0.96 [illustrative]. A no-augmentation baseline on the same architecture lands ~0.93 [illustrative]; augmentation buys the last ~3 points.

Rubric-by-rubric check

| Section | Points | Where the pipeline earns it |
|---|---|---|
| Data loader | 15 | Custom ShapeDS, Normalize(mean=[0.1], std=[0.25]), val split drawn from the 4 000 train (no leakage from the 1 000 test). |
| Baseline CNN | 30 | Three conv blocks (32 / 64 / 128 channels), ~288 k params (under the 500 k budget), 15-epoch training, val accuracy reported each epoch. |
| Augmentation | 20 | RandomAffine(degrees=15, translate=0.05, scale=0.9–1.1), with a measured ~3 pt val-accuracy delta vs. no augmentation [illustrative]. |
| Training stability | 15 | Seeded RNGs, fixed batch size 128, cosine LR schedule, early stopping with patience 4, best-state restore. |
| Scoring report | 10 | classification_report + confusion matrix on val; arrow is typically confused with triangle [illustrative]. |
| Theory short-answer | 10 | See below. |

Theory short-answer — reference answers

1. Receptive field of Conv3 → Pool2 → Conv3 → Pool2 → Conv3

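Walk from the input towards the output, tracking the receptive-field size r and the effective stride s (both measured in input pixels). Start at r = 1, s = 1; a layer with kernel size k and stride t updates r ← r + (k − 1)·s and then s ← s·t. With stride-1 3 × 3 convolutions and stride-2 2 × 2 pooling:

| Layer | k | t | r after | s after |
|---|---|---|---|---|
| Conv3 | 3 | 1 | 3 | 1 |
| Pool2 | 2 | 2 | 4 | 2 |
| Conv3 | 3 | 1 | 8 | 2 |
| Pool2 | 2 | 2 | 10 | 4 |
| Conv3 | 3 | 1 | 18 | 4 |

So one output neuron sees an 18 × 18 patch of the 64 × 64 input. That is smaller than the largest shapes (radius ≤ 24, so diameter ≤ 48), but the AdaptiveAvgPool head integrates information across the whole feature map anyway.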

2. BatchNorm at training vs. inference

During training, BatchNorm normalizes each channel using the current mini-batch's mean and variance, and simultaneously updates exponentially-averaged running statistics. During inference, it uses those frozen running statistics instead — this is what model.train(False) (equivalently, model.eval()) toggles; the same switch also turns Dropout off.

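If you forget to switch into inference mode before predicting on the test set, two concrete things go wrong: (a) predictions depend on the batch composition — reordering or batching the test set differently changes the output, which is obviously bad for reproducibility; (b) for very small test batches (or batch size 1), the per-batch mean and variance are noisy estimates computed from very few values, so the normalized activations drift away from what the later layers were trained on and accuracy can collapse by tens of points. Two side effects come along as well: every forward pass keeps updating the running statistics with test data, and Dropout stays active, so predictions become stochastic.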

3. Why rotation augmentation helps here — and when to remove it

Helps because: the generator already rotates each shape by ±25° at draw time, but the training set only contains one such draw per index. RandomAffine at training time exposes the network to a richer distribution of orientations than the 4 000 fixed training images contain, which closes the gap to the test distribution (which is drawn from the same generator and so contains rotated examples not in the training set). It also acts as a regularizer on shape-orientation correlations the network would otherwise memorize.

Remove it when the task is orientation-dependent: e.g. classifying digits (6 vs. 9), arrows by direction (left vs. right), or street-sign categories where rotation changes semantics. In our dataset, arrow is the borderline case — if the rubric had asked for arrow direction rather than “is an arrow”, you'd disable rotation (and horizontal flip).

Common mistakes

Compare your work

Tick each item.