Practical Tips for PyTorch Training on Ubuntu
1. Environment Configuration and Basic Optimization
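Before any training work, it is worth verifying that the CUDA build of PyTorch actually sees the GPU. The minimal sketch below (assuming the NVIDIA driver and a CUDA-enabled PyTorch wheel are already installed) also enables cuDNN autotuning, a cheap basic optimization when input shapes are fixed.

import torch

print(torch.__version__, torch.version.cuda)    # PyTorch build and the CUDA version it was built against
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))        # first visible GPU
    # cuDNN benchmark mode autotunes convolution algorithms;
    # it pays off when input shapes stay constant between iterations
    torch.backends.cudnn.benchmark = True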
2. Data Loading and Preprocessing Acceleration
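The input pipeline is often the real bottleneck rather than the GPU. Below is a minimal sketch (the dataset, batch size, and worker count are placeholders to tune for your machine) that moves decoding and augmentation into background worker processes and pins host memory so GPU copies can run asynchronously.

from torch.utils.data import DataLoader

# train_dataset is assumed to be an existing torch.utils.data.Dataset
train_loader = DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=8,             # background processes for loading/augmentation
    pin_memory=True,           # page-locked buffers enable non_blocking GPU copies
    persistent_workers=True,   # keep workers alive between epochs
    prefetch_factor=4,         # batches each worker prepares in advance
)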
3. Training Acceleration and GPU Memory Optimization
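The most direct single-GPU speedup is automatic mixed precision (AMP). In the snippet below, model and dataloader are assumed to be defined for your task, and the CrossEntropyLoss is only a placeholder criterion.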
import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler

# model and dataloader are assumed to be defined elsewhere
model = model.cuda()
optimizer = optim.SGD(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()   # placeholder loss; use the criterion for your task
scaler = GradScaler()

for x, y in dataloader:
    # non_blocking=True overlaps the copy with compute (requires pin_memory=True in the DataLoader)
    x, y = x.cuda(non_blocking=True), y.cuda(non_blocking=True)
    optimizer.zero_grad()
    with autocast():                    # forward pass in mixed precision
        out = model(x)
        loss = criterion(out, y)
    scaler.scale(loss).backward()       # scale the loss to avoid fp16 gradient underflow
    scaler.step(optimizer)              # unscales gradients, then runs the optimizer step
    scaler.update()                     # adjusts the scale factor for the next iteration
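In recent PyTorch releases the same functionality also lives under torch.amp (torch.amp.autocast("cuda") and, in newer versions, torch.amp.GradScaler("cuda")); the torch.cuda.amp spelling above still works but is being deprecated.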
# Terminal: torchrun --nproc_per_node=2 train.py
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

dist.init_process_group("nccl")               # NCCL backend for multi-GPU training on Linux
local_rank = int(os.environ["LOCAL_RANK"])    # set by torchrun for each spawned process
torch.cuda.set_device(local_rank)
model = model.to(local_rank)                  # model assumed to be defined elsewhere
model = DDP(model, device_ids=[local_rank])
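Two details usually accompany this setup. Each rank needs a DistributedSampler so the processes see disjoint shards of the data (a minimal sketch below, assuming train_dataset and the epoch count are defined elsewhere), and GPU visibility per run can be restricted from the shell, e.g. CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 train.py.

from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

sampler = DistributedSampler(train_dataset)     # partitions indices across ranks
train_loader = DataLoader(train_dataset, batch_size=64, sampler=sampler)

for epoch in range(num_epochs):                 # num_epochs assumed to be defined
    sampler.set_epoch(epoch)                    # reshuffle with a different seed each epoch
    for x, y in train_loader:
        ...                                     # forward/backward as in the AMP loop above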
The approaches above cover AMP, the DP vs. DDP choice, and GPU visibility control, and they fit the common training scenarios on Ubuntu.

4. Performance Profiling and Bottleneck Identification
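When throughput is lower than expected, torch.profiler can show whether the time goes to data loading, CPU-side ops, or GPU kernels. The loop below reuses the model, dataloader, criterion, and optimizer from the previous sections.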
import torch.profiler

# Make sure ./log exists before running; the trace is written there by on_trace_ready.
with torch.profiler.profile(
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3),
    on_trace_ready=lambda prof: prof.export_chrome_trace("./log/trace.json"),
    record_shapes=True, with_stack=True
) as prof:
    for x, y in dataloader:
        out = model(x.cuda())
        loss = criterion(out, y.cuda())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        prof.step()   # advance the profiling schedule once per iteration
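The exported trace can be opened in Chrome at chrome://tracing (or in TensorBoard via the torch-tb-profiler plugin). A quick textual summary is also available right after the with block:

# top operators by accumulated GPU time
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))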
5. Model Compression and Deployment Preparation
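As a starting point for this section, the sketch below applies post-training dynamic quantization, which converts Linear layers to int8 for CPU inference; the layer set and dtype are illustrative, and accuracy should be re-checked after conversion.

import torch
import torch.nn as nn

model.eval()                                    # model assumed to be a trained float32 model
quantized = torch.quantization.quantize_dynamic(
    model, {nn.Linear}, dtype=torch.qint8       # dynamically quantize Linear weights to int8
)
torch.save(quantized.state_dict(), "model_int8.pt")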