A Practical Guide to Parallel Computing with PyTorch on Linux
1 Parallel Paradigms and Use Cases
PyTorch offers several complementary forms of parallelism: single-node multi-GPU data parallelism with DistributedDataParallel (DDP), multi-node training launched via torchrun, and CPU-side parallelism in the data-loading pipeline. The examples below walk through each in turn.
2 Environment Setup and Quick Checks
Use nvidia-smi to check the driver and GPU status; install the CUDA Toolkit if needed and set the relevant environment variables (PATH, LD_LIBRARY_PATH, CUDA_HOME). Then install PyTorch, for example:
conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
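As a quick sanity check (a minimal sketch that only assumes a working PyTorch install), the following confirms that PyTorch sees CUDA and the expected number of GPUs:

import torch
print(torch.__version__, torch.version.cuda)  # PyTorch version and the CUDA version it was built against
print(torch.cuda.is_available())              # True if the driver and runtime are usable
print(torch.cuda.device_count())              # number of GPUs visible to this process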
3 Hands-On Examples
Example 1 Minimal single-node multi-GPU DDP skeleton (torchrun)
# train_ddp.py
import os, torch, torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, DistributedSampler
from torchvision.models import resnet18
import torchvision.transforms as T
from torchvision.datasets import CIFAR10

def main():
    # torchrun sets LOCAL_RANK for every process it launches.
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend="nccl", init_method="env://")

    model = resnet18(num_classes=10).to(local_rank)
    model = DDP(model, device_ids=[local_rank])

    transform = T.Compose([T.ToTensor(),
                           T.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])
    train_set = CIFAR10(root="./data", train=True, download=True, transform=transform)
    sampler = DistributedSampler(train_set)  # splits the dataset across ranks
    loader = DataLoader(train_set, batch_size=128, sampler=sampler,
                        num_workers=4, pin_memory=True)

    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
    criterion = torch.nn.CrossEntropyLoss()

    model.train()
    for epoch in range(3):
        sampler.set_epoch(epoch)  # ensures a different shuffle each epoch
        for x, y in loader:
            x = x.to(local_rank, non_blocking=True)
            y = y.to(local_rank, non_blocking=True)
            optimizer.zero_grad()
            out = model(x)
            loss = criterion(out, y)
            loss.backward()
            optimizer.step()
        if local_rank == 0:
            print(f"Epoch {epoch} done.")

    dist.destroy_process_group()

if __name__ == "__main__":
    main()
Launch (single node, 4 GPUs):
torchrun --nproc_per_node=4 train_ddp.py
Key points: use a DistributedSampler, call sampler.set_epoch each epoch, enable pin_memory=True and non_blocking transfers, and let the main process (rank 0) handle printing and checkpoint saving.
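A minimal sketch of the rank-0 saving pattern mentioned above, assuming the setup from Example 1 (the file name ddp_checkpoint.pt is illustrative):

# Inside the training script from Example 1, e.g. at the end of an epoch.
if dist.get_rank() == 0:
    # model.module is the underlying model wrapped by DDP.
    torch.save(model.module.state_dict(), "ddp_checkpoint.pt")
dist.barrier()  # keep other ranks from moving on before the checkpoint is written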
Example 2 Multi-node multi-GPU launch (two-node example)
# On node 0:
torchrun --nproc_per_node=4 --nnodes=2 --node_rank=0 --master_addr=192.168.1.1 --master_port=12345 train_ddp.py
# On node 1:
torchrun --nproc_per_node=4 --nnodes=2 --node_rank=1 --master_addr=192.168.1.1 --master_port=12345 train_ddp.py
Notes: --nproc_per_node is the number of GPUs per node, --nnodes is the total number of nodes, --node_rank is this node's index, and --master_addr/--master_port give the master node's address and port.
Example 3 Data loading and CPU parallelism
from torch.utils.data import DataLoader, Dataset
import torch

class MyDataset(Dataset):
    def __len__(self):
        return 10000

    def __getitem__(self, idx):
        # Synthetic sample: a random 3x32x32 tensor and a random class label in [0, 10).
        return torch.randn(3, 32, 32), torch.randint(0, 10, (1,)).item()

loader = DataLoader(
    MyDataset(),
    batch_size=256,
    num_workers=8,            # multi-process prefetching
    pin_memory=True,          # speeds up host-to-device copies
    persistent_workers=True,  # keep workers alive across epochs to avoid re-spawn overhead
)
Tip: increase num_workers when I/O and preprocessing dominate, but watch the system's CPU core count, memory, and disk bandwidth to avoid resource contention.
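One way to pick a starting value for num_workers (a sketch; the halving heuristic is an assumption, not a fixed rule) is to base it on the cores actually available to the process:

import os

# Cores available to this process on Linux (respects taskset/cgroup restrictions).
available_cores = len(os.sched_getaffinity(0))
# A conservative starting point; tune up or down based on profiling.
num_workers = max(1, available_cores // 2)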
4 Performance Optimization and Common Pitfalls
Set num_workers to roughly a divisor of the CPU core count, and enable pin_memory together with non_blocking transfers. The effective global batch size is per_gpu_batch * world_size, and the learning rate is usually scaled linearly with world_size or according to another agreed rule. Common pitfalls: forgetting to call sampler.set_epoch, so every epoch sees the same shuffle order; not using DistributedSampler, which duplicates or drops samples across ranks; every process saving checkpoints or printing logs instead of only rank 0; and a CUDA_VISIBLE_DEVICES setting that is inconsistent with how the job was launched.
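A minimal sketch of the linear learning-rate scaling rule, assuming the DDP setup from Example 1 has already been initialized (base_lr and the value 0.1 are illustrative):

# Assumes dist.init_process_group(...) has already been called (see Example 1).
world_size = dist.get_world_size()
base_lr = 0.1                      # per-GPU learning rate (illustrative value)
scaled_lr = base_lr * world_size   # linear scaling with the number of processes
optimizer = torch.optim.SGD(model.parameters(), lr=scaled_lr, momentum=0.9)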