Part A · Common ML/DL pitfalls
Fourteen bugs that we have seen in actual student submissions for similar competitions. Each is
framed as a wrong example, a one-line reason, and the corrected version. The order is roughly
"most common" to "most subtle" — the first six cost the most points.
1. Data leakage — fitting the scaler before splitting
# BAD: scaler sees test rows during fit; statistics leak from test to train
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
X_scaled = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)
Why it fails. The mean and std used to scale the train set were computed using
test rows. Validation accuracy is optimistically biased — every model evaluated this way looks
better than it really is, and the bias is non-uniform across folds.
# GOOD: split first, fit on train only, transform train + test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
2. Validation-set contamination — tuning hyperparameters on the test set
# BAD: the "test" set is being used to pick hyperparameters; it is no longer a test set
for lr in [1e-2, 1e-3, 1e-4]:
    model = train(X_train, y_train, lr=lr)
    score = model.score(X_test, y_test)  # every candidate is scored on the test rows
    print(lr, score)
best_lr = ...  # picked by reading the lines printed above
Why it fails. Every hyperparameter you select by looking at X_test
smuggles information from the test set into the model. Your reported test score is now an
optimistic training-time metric.
# GOOD: split into train / val / test; tune on val, report on test exactly once
X_tr, X_tmp, y_tr, y_tmp = train_test_split(X, y, test_size=0.3, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_tmp, y_tmp, test_size=0.5, random_state=0)
# start from -inf so the first candidate always wins, even for metrics that can be negative
best_lr, best_score = None, float("-inf")
for lr in [1e-2, 1e-3, 1e-4]:
    m = train(X_tr, y_tr, lr=lr)
    s = m.score(X_val, y_val)  # selection only ever looks at the validation split
    if s > best_score:
        best_lr, best_score = lr, s
final = train(X_tr, y_tr, lr=best_lr)
print("test:", final.score(X_test, y_test))  # touched once, at the end
3. Forgetting model.train(False) at inference
# BAD: BatchNorm uses batch stats, Dropout zeroes units at inference time
model = MyNet().to(device)
load_weights(model, "best.pt")
with torch.no_grad():
    preds = model(X_test_tensor)  # model is still in training mode
Why it fails. BatchNorm in training mode normalises with the current batch's
statistics, which on small or single-sample batches are wildly off. Dropout stays
active, randomly zeroing activations, so predictions become noisy. Validation accuracy plunges
with no apparent cause. The toggle is model.train(False) — equivalent to the common shorthand
model.eval() — with model.train(True) to flip back before the next training epoch.
# GOOD: flip the module out of training mode before any inference pass
model.train(False)  # same effect as model.eval()
with torch.no_grad():
    preds = model(X_test_tensor)
# ... and back to training mode at the top of the next training epoch
model.train(True)
4. Forgetting torch.no_grad() at inference
# BAD: autograd graph is built for every val batch; peak memory balloons and can OOM
model.train(False)
val_loss = 0.0
for xb, yb in val_dl:
    out = model(xb.to(device))  # graph recorded, activations kept alive
    val_loss += loss_fn(out, yb.to(device)).item()
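Why it fails. model.train(False) only changes layer behaviour (BatchNorm, Dropout); it does
not switch off autograd. Parameters still have requires_grad=True, so every forward pass records
a graph and keeps intermediate activations alive because PyTorch assumes you might call .backward().
Validation then needs far more memory than necessary, and large batches or long validation loops trigger CUDA OOM.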
# GOOD: disable the graph for the duration of evaluation
model.train(False)
val_loss = 0.0
with torch.no_grad():  # no graph is recorded inside this block
    for xb, yb in val_dl:
        out = model(xb.to(device))
        val_loss += loss_fn(out, yb.to(device)).item()
5. Wrong shuffle on the validation / test DataLoader
# BAD: shuffling val/test breaks ordered submissions and inflates metric variance
val_dl = DataLoader(val_ds, batch_size=256, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=256, shuffle=True)
Why it fails. The submission CSV must be in the original test-set order; shuffling
silently misaligns predictions with ids. Even for plain metrics, shuffling adds nothing and only
complicates reproducibility.
# GOOD: shuffle ONLY the training loader
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=256, shuffle=False)
test_dl = DataLoader(test_ds, batch_size=256, shuffle=False)
6. Random seeds set in only one place
# BAD: NumPy and Python's random module are still nondeterministic
import torch
torch.manual_seed(0)
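Why it fails. Augmentation pipelines often call random.random();
train/val splits often call np.random; CUDA kernels are nondeterministic by default.
Four sources of randomness — Python's random, NumPy, PyTorch CPU, and PyTorch CUDA — all need
to be seeded, and cuDNN has to be told to behave deterministically.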
# GOOD: seed all four sources and turn on cuDNN determinism
import os, random, numpy as np, torch
def set_seed(seed=0):
    """Seed every RNG the training pipeline draws from and request deterministic cuDNN.

    Args:
        seed: Integer used for Python's ``random``, NumPy, and PyTorch (CPU and CUDA).
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade a little speed for run-to-run repeatability of cuDNN kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # NOTE: hash randomisation is fixed when the interpreter starts, so this only
    # affects subprocesses launched afterwards, not the running interpreter.
    os.environ["PYTHONHASHSEED"] = str(seed)
set_seed(0)
7. Class imbalance ignored — accuracy lies
# BAD: on a 95/5 dataset, "always predict 0" gets 95% accuracy
print("accuracy:", (preds == y).mean())
Why it fails. Accuracy on imbalanced data is dominated by the majority class.
The model can be trivial and still look great. Always report per-class precision / recall, or
use balanced accuracy / F1 / ROC-AUC, and consider class weights or resampling.
# GOOD: compute balanced metrics and weight the loss by inverse class frequency
from sklearn.metrics import classification_report, balanced_accuracy_score
print(classification_report(y, preds, digits=4))
print("balanced acc:", balanced_accuracy_score(y, preds))
# clamp so a class with zero training samples gets a finite weight instead of inf / NaN
# (pass minlength=<number of classes> to bincount if the highest label may be absent)
counts = torch.bincount(y_train_tensor).clamp(min=1)
weights = 1.0 / counts.float()
weights = weights / weights.sum() * len(counts)  # rescale so the weights average to 1
loss_fn = torch.nn.CrossEntropyLoss(weight=weights.to(device))
8. Learning rate too aggressive — loss goes to NaN
# BAD: default-ish LR for a deep net with no warmup; loss explodes around step 100
opt = torch.optim.AdamW(model.parameters(), lr=1e-2)
Why it fails. A too-high LR pushes weights into a regime where activations
saturate or explode; gradients become huge; the next step over-corrects further. The fix is a
smaller LR, plus gradient clipping. To find a good LR systematically, sweep LR from 1e-7 to 1
across a few hundred steps and plot loss vs LR — the steepest descending region is the safe band.
# GOOD: smaller LR + warmup + gradient clipping; cosine decay afterwards
from torch.optim.lr_scheduler import OneCycleLR
opt = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-2)
# OneCycleLR is stepped once per batch; total_steps covers EPOCHS full passes
sched = OneCycleLR(opt, max_lr=3e-4, total_steps=len(train_dl) * EPOCHS)
for epoch in range(EPOCHS):
    for xb, yb in train_dl:
        opt.zero_grad()
        loss = loss_fn(model(xb), yb)
        loss.backward()
        # clip after backward (gradients exist) and before step (they get used)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        sched.step()
9. Optimizer step() before zero_grad()
# BAD: gradients from previous batches still live in .grad; step uses their sum
# (zero_grad() is never called. Note that calling it right AFTER opt.step() would be
#  fine — what matters is that .grad is cleared once per step, before the next backward().)
for xb, yb in train_dl:
    loss = loss_fn(model(xb), yb)
    loss.backward()  # adds this batch's gradients onto the stale ones
    opt.step()       # steps on g_1 + g_2 + ... + g_k
Why it fails. PyTorch accumulates gradients on each
backward(). If you do not zero before backward, batch k takes an update
using g_1 + g_2 + ... + g_k. The model trains in the wrong direction and almost
always diverges.
# GOOD: zero first, then forward / backward / step
for xb, yb in train_dl:
    opt.zero_grad()  # clear last batch's gradients before accumulating new ones
    loss = loss_fn(model(xb), yb)
    loss.backward()
    opt.step()
10. Augmentation applied to val / test
# BAD: same transform on every split; val metric becomes noisy and pessimistic
from torchvision import transforms as T
tfm = T.Compose([T.RandomHorizontalFlip(), T.RandomCrop(32, padding=4),
T.ToTensor(), T.Normalize(MEAN, STD)])
train_ds = CIFAR10(..., transform=tfm)
val_ds = CIFAR10(..., transform=tfm) # bug
test_ds = CIFAR10(..., transform=tfm) # bug
Why it fails. Augmentation is a regulariser, not part of the data distribution.
Applying it at inference time means you are testing on a randomised, distorted version of the data —
metrics jitter run-to-run and submissions become non-reproducible.
# GOOD: train uses augmentation; val/test uses only the deterministic preprocessing
train_tfm = T.Compose([T.RandomHorizontalFlip(), T.RandomCrop(32, padding=4),
T.ToTensor(), T.Normalize(MEAN, STD)])
plain_tfm = T.Compose([T.ToTensor(), T.Normalize(MEAN, STD)])
train_ds = CIFAR10(..., transform=train_tfm)
val_ds = CIFAR10(..., transform=plain_tfm)
test_ds = CIFAR10(..., transform=plain_tfm)
11. ReduceLROnPlateau with the wrong mode
# BAD: tracking validation accuracy in 'min' mode means LR drops when acc improves
sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min", patience=3)
for epoch in range(N):
    val_acc = score_on_val(model, val_dl)
    sched.step(val_acc)  # treats higher acc as "worse"
Why it fails. The scheduler is direction-aware. In 'min' mode it
drops the LR when the metric stops going down. If you feed accuracy (which should go up), every
improvement looks like a plateau, so the LR is gradually starved out of the training run.
# GOOD: match mode to the metric direction — accuracy goes UP, loss goes DOWN
sched_acc = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="max", patience=3)
sched_loss = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min", patience=3)
12. Saving only the model weights, no optimizer state
# BAD: resume picks weights but starts Adam moments from zero; training stutters
torch.save(model.state_dict(), "ckpt.pt")
# ... later ...
model.load_state_dict(torch.load("ckpt.pt"))
opt = torch.optim.AdamW(model.parameters(), lr=3e-4) # fresh optimizer state
Why it fails. Adam-family optimizers carry first- and second-moment estimates
per parameter. Reinitialising those after a checkpoint is equivalent to a soft warm restart —
the loss jumps for a few hundred steps and your cosine schedule is now out of sync.
# GOOD: checkpoint everything needed to resume EXACTLY where you left off
torch.save({
"epoch": epoch,
"model_state_dict": model.state_dict(),
"optimizer_state_dict": opt.state_dict(),
"scheduler_state_dict": sched.state_dict(),
"best_val": best_val,
}, "ckpt.pt")
ck = torch.load("ckpt.pt", map_location=device)
model.load_state_dict(ck["model_state_dict"])
opt.load_state_dict(ck["optimizer_state_dict"])
sched.load_state_dict(ck["scheduler_state_dict"])
start_epoch = ck["epoch"] + 1
13. Train / val normalization-statistic mismatch
# BAD: different MEAN/STD for train vs val; model sees a different distribution
train_tfm = T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
plain_tfm = T.Normalize(mean=[0.5, 0.5, 0.5 ], std=[0.5, 0.5, 0.5 ])
Why it fails. The model learns features under the train-time normalisation.
Feeding it inference-time inputs with different per-channel stats shifts every activation off
distribution. Accuracy can drop 10–30 points and you will blame the architecture.
# GOOD: single source of truth for normalization stats, reused everywhere
NORM = T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
train_tfm = T.Compose([..., T.ToTensor(), NORM])
plain_tfm = T.Compose([T.ToTensor(), NORM])
14. Optimizer built before model.to(device)
# BAD: optimizer holds references to CPU tensors; .to('cuda') later orphans them
model = MyNet()
opt = torch.optim.AdamW(model.parameters(), lr=3e-4)
model.to("cuda") # parameters get new GPU tensors; opt still tracks CPU ones
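Why it fails. The optimizer captures the parameter tensors that exist when it is built.
Whether a later model.to(device) keeps those same objects is an implementation detail: current
PyTorch usually moves nn.Module parameters in place, but some backends and settings rebind
parameter storage to new tensors, and optimizers that allocate state at construction time end up
with that state on the wrong device. When parameters are rebound, the optimizer updates tensors the
forward pass no longer uses — loss does not change between epochs and there is no error.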
# GOOD: move to device first, then build the optimizer
model = MyNet().to("cuda")
opt = torch.optim.AdamW(model.parameters(), lr=3e-4)
Part B · Colab / Kaggle playbook
The first time you run a training job on a free GPU you will lose 30 minutes to runtime issues.
The second time you will lose 10. This section is meant to take that down to zero.
Choosing a runtime — T4 vs L4 vs A100
Colab exposes (depending on tier and availability) three NVIDIA GPUs you are likely to encounter:
T4 (16 GB, the free-tier default — Turing, no native bf16, fine for small CNNs
and MLPs up to about ResNet-50 at batch 64), L4 (24 GB, Ada Lovelace — much
better bf16 support, roughly 2x T4 throughput on transformer workloads), and A100
(40 GB, Ampere — flagship class, for training transformers from scratch or running large-batch
fine-tunes). For USAAIO-scale work the T4 is almost always enough; reach for L4 when you actually
bottleneck on memory or epoch time. Kaggle's equivalents are a P100, two T4s, or a TPU v3-8.
To switch runtime in Colab: Runtime → Change runtime type → Hardware accelerator, then
pick the GPU class. Changing the runtime always restarts the kernel — mount Drive and re-import
after switching.
Verifying the GPU is actually attached
!nvidia-smi # prints GPU model + memory usage
import torch
print(torch.cuda.is_available()) # True
print(torch.cuda.get_device_name(0)) # e.g. 'Tesla T4' or 'NVIDIA L4'
print(torch.cuda.get_device_capability(0)) # compute capability tuple
print(torch.cuda.mem_get_info()) # (free_bytes, total_bytes)
If torch.cuda.is_available() returns False, the runtime is CPU-only — the menu
selection did not take effect, or quota was exceeded. Restart runtime and re-check.
Persistent storage — mount Google Drive
from google.colab import drive
drive.mount('/content/drive')
# now /content/drive/MyDrive is your Drive root
import os
CKPT_DIR = '/content/drive/MyDrive/usaaio/checkpoints'
os.makedirs(CKPT_DIR, exist_ok=True)
Anything written under /content/ dies with the runtime; anything under
/content/drive/MyDrive/ survives. Keep raw data on Drive only if it is small —
Drive I/O is slow. For multi-GB datasets, copy once into the local SSD at runtime start:
!cp /content/drive/MyDrive/usaaio/data.tar /content/data.tar
!tar -xf /content/data.tar -C /content/ # fast local reads from here
Saving and resuming checkpoints
import os, torch
def save_checkpoint(path, model, opt, sched, epoch, best_val):
    """Write a resumable training checkpoint to ``path``.

    The file is first written next to ``path`` and then renamed over it, so an
    interrupted save leaves the previous checkpoint in place instead of a
    truncated file.

    Args:
        path: Destination file for the checkpoint.
        model: Object whose ``state_dict()`` is stored under ``'model_state_dict'``.
        opt: Optimizer whose ``state_dict()`` is stored under ``'optimizer_state_dict'``.
        sched: Scheduler whose ``state_dict()`` is stored under ``'scheduler_state_dict'``.
        epoch: Value stored under ``'epoch'``.
        best_val: Value stored under ``'best_val'``.
    """
    state = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': opt.state_dict(),
        'scheduler_state_dict': sched.state_dict(),
        'best_val': best_val,
    }
    tmp_path = f"{path}.tmp"
    torch.save(state, tmp_path)
    os.replace(tmp_path, path)  # swap in the finished file in one step
def resume_if_exists(path, model, opt, sched, device):
    """Restore training state from ``path`` if a checkpoint is present.

    Args:
        path: Checkpoint file to look for.
        model: Receives ``'model_state_dict'`` via ``load_state_dict``.
        opt: Receives ``'optimizer_state_dict'`` via ``load_state_dict``.
        sched: Receives ``'scheduler_state_dict'`` via ``load_state_dict``.
        device: Passed to ``torch.load`` as ``map_location``.

    Returns:
        ``(start_epoch, best_val)``: ``(0, inf)`` when no checkpoint exists,
        otherwise the stored epoch plus one and the stored ``'best_val'``.
    """
    if not os.path.exists(path):
        return 0, float('inf')
    ck = torch.load(path, map_location=device)
    model.load_state_dict(ck['model_state_dict'])
    opt.load_state_dict(ck['optimizer_state_dict'])
    sched.load_state_dict(ck['scheduler_state_dict'])
    # the stored epoch was completed, so training continues with the next one
    return ck['epoch'] + 1, ck['best_val']
last_path = os.path.join(CKPT_DIR, 'last.pt')
best_path = os.path.join(CKPT_DIR, 'best.pt')
start_epoch, best_val = resume_if_exists(last_path, model, opt, sched, device)
for epoch in range(start_epoch, EPOCHS):
    # ... train one epoch, then compute val_loss on the validation set ...
    if val_loss < best_val:  # sticky best.pt: only overwritten when validation improves
        best_val = val_loss
        save_checkpoint(best_path, model, opt, sched, epoch, best_val)
    # rolling last.pt: written every epoch so a disconnect costs at most one epoch
    save_checkpoint(last_path, model, opt, sched, epoch, best_val)
Save every epoch, not just the best one — Colab disconnects randomly and "best so far"
is useless if you cannot resume the schedule. Keep both: a rolling last.pt and a
sticky best.pt updated only when validation improves.
Auto-disconnect mitigation
Colab disconnects idle browser tabs after about 90 minutes; the runtime itself recycles after 12
hours. A common community workaround is a small JavaScript snippet pasted into the browser
console that programmatically clicks the connect button every few minutes:
// open browser DevTools console on the Colab tab, paste this, hit enter
// Clicks the element matching "colab-toolbar-button#connect" if it is present.
function ColabKeepAlive() {
const btn = document.querySelector("colab-toolbar-button#connect");
// querySelector returns null when nothing matches, so guard before clicking
if (btn) btn.click();
}
setInterval(ColabKeepAlive, 60 * 1000); // every 60 seconds
Gray area. This violates the spirit of Colab's free-tier idle policy and your
notebook may be flagged. Use it only for personal training runs you can babysit.
USAAIO Round 2 is proctored and runs in a controlled environment — anything
that simulates user activity is out of bounds there. The point of this snippet is to let you
actually finish self-study training runs at home.
Tracking GPU memory
import gc, torch
def gpu_mem():
    """Return a one-line summary of allocated, reserved, and peak-allocated CUDA memory.

    Returns:
        A string of the form ``"alloc X MB | reserved Y MB | peak Z MB"``, where each
        figure is the byte count reported by ``torch.cuda`` divided by 1024**2.
    """
    mib = 1024**2
    alloc = torch.cuda.memory_allocated() / mib
    rsvd = torch.cuda.memory_reserved() / mib
    peak = torch.cuda.max_memory_allocated() / mib
    return f"alloc {alloc:6.1f} MB | reserved {rsvd:6.1f} MB | peak {peak:6.1f} MB"
print(gpu_mem())
# after a model dies / OOM, clear the cache before retrying
del model, opt
gc.collect()
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
empty_cache() does not free tensors — it returns cached free blocks to the driver
so nvidia-smi stops over-reporting usage. To actually free, drop Python references
first (del + gc.collect()), then call empty_cache.
Reproducibility checklist
- Seed Python's random, NumPy, PyTorch CPU, and PyTorch CUDA — all four.
- Set torch.backends.cudnn.deterministic = True and torch.backends.cudnn.benchmark = False.
- Use generator=torch.Generator().manual_seed(seed) in DataLoader for reproducible shuffling.
- Pin every dependency version in a requirements.txt cell at the top of the notebook.
- Log the git commit hash (if any) and the runtime GPU model alongside metrics.
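- Save the seed and the RNG states (random.getstate(), np.random.get_state(), torch.get_rng_state(), torch.cuda.get_rng_state_all()) inside the checkpoint — the seed alone does not make resume runs bitwise-reproducible.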
Profiling slow code
# cell magics — quick wins for an isolated slow block
%%time
result = expensive_function(X)
%%timeit -n 5 -r 3
expensive_function(X)
# cProfile — see which function calls dominate
import cProfile, pstats
pr = cProfile.Profile()
pr.enable()
train_one_epoch(model, train_dl, opt, loss_fn, device)
pr.disable()
pstats.Stats(pr).sort_stats('cumulative').print_stats(20)
# PyTorch profiler — per-op timings on CPU + CUDA (pass with_stack=True to also record stack traces)
from torch.profiler import profile, ProfilerActivity
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             record_shapes=True) as prof:
    # zip with range(20) stops after 20 batches so the trace stays small
    for _, (xb, yb) in zip(range(20), train_dl):
        opt.zero_grad()
        loss = loss_fn(model(xb.to(device)), yb.to(device))
        loss.backward()
        opt.step()
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=15))
Submission template
Round-2-style submissions are typically a CSV with two columns, sorted by id, no trailing blank
line. The function below takes a trained model and an unshuffled test DataLoader (that yields
(id_tensor, image_tensor) tuples) and writes a clean file every time.
import csv, torch
def write_submission(model, test_dl, device, out_path,
                     id_col='id', pred_col='prediction'):
    """Run ``model`` over ``test_dl`` and write an id-sorted two-column CSV.

    Args:
        model: Module called on each input batch; its output is arg-maxed over
            the last dimension to obtain the predicted label.
        test_dl: Iterable yielding ``(ids, inputs)`` pairs. ``ids`` may be a
            tensor or a plain sequence (e.g. string ids).
        device: Device the input batches are moved to.
        out_path: Destination CSV path.
        id_col: Header of the id column.
        pred_col: Header of the prediction column.
    """
    model.train(False)  # inference mode for layers such as BatchNorm / Dropout
    rows = []
    with torch.no_grad():
        for ids, xb in test_dl:
            xb = xb.to(device, non_blocking=True)
            logits = model(xb)
            preds = logits.argmax(dim=-1).cpu().tolist()
            # ids are only converted when they are a tensor; lists/tuples pass through
            if torch.is_tensor(ids):
                ids = ids.cpu().tolist()
            rows.extend(zip(ids, preds))
    rows.sort(key=lambda r: r[0])  # ensure ordered output
    # newline='' + lineterminator='\n' guarantees no spurious blank lines
    with open(out_path, 'w', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow([id_col, pred_col])
        writer.writerows(rows)
    print(f"wrote {len(rows)} rows to {out_path}")
write_submission(model, test_dl, device, '/content/submission.csv')
Before uploading, sanity-check by reading the file back with pandas.read_csv and
asserting (a) row count equals expected test-set size, (b) the id column is a permutation of the
sample-submission ids, (c) prediction values are inside the legal label set.