PyTorch Performance Tuning Guide on Linux
1. Environment configuration and basic checks (see the sketch after this outline)
2. Data loading and system-level optimization
3. Training acceleration and GPU memory optimization
4. GPU memory diagnostics and common bottleneck handling
5. A reusable optimization checklist and a minimal example
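As a minimal sketch of the kind of basic checks item 1 refers to (the cudnn.benchmark setting is a common choice for fixed input shapes, not a mandated one):

import torch

# Versions and device visibility: the first things to confirm on a fresh machine.
print("PyTorch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    print("CUDA runtime:", torch.version.cuda)
    print("cuDNN:", torch.backends.cudnn.version())

# With fixed input shapes, letting cuDNN benchmark and cache the fastest
# kernels is usually a safe speedup.
torch.backends.cudnn.benchmark = True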
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.cuda.amp import autocast, GradScaler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = nn.Linear(1024, 1024).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scaler = GradScaler()

# Synthetic data: keep the dataset on the CPU so that pin_memory and worker
# processes can be used; batches are moved to the GPU inside the loop.
x = torch.randn(1024, 1024)
y = torch.randn(1024, 1024)
loader = DataLoader(TensorDataset(x, y), batch_size=256, shuffle=True,
                    num_workers=4, pin_memory=True)

accumulation_steps = 4
model.train()

for epoch in range(3):
    optimizer.zero_grad(set_to_none=True)
    for i, (inputs, targets) in enumerate(loader, 1):
        # Asynchronous host-to-device copies (effective together with pin_memory).
        inputs = inputs.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        # AMP: run the forward pass and loss in mixed precision.
        with autocast():
            outputs = model(inputs)
            loss = criterion(outputs, targets) / accumulation_steps

        scaler.scale(loss).backward()

        # Gradient accumulation: step the optimizer every accumulation_steps batches.
        if i % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

            # GPU memory monitoring (this toy dataset yields only 4 batches per
            # epoch, so report usage after each optimizer step).
            alloc = torch.cuda.memory_allocated() / 1024**2
            resv = torch.cuda.memory_reserved() / 1024**2
            print(f"[epoch {epoch} step {i}] alloc {alloc:.1f} MB | resv {resv:.1f} MB")
The example above combines AMP, gradient accumulation, and GPU memory monitoring; in real workloads, torch.compile, DDP, and checkpointing can be added as needed.
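A rough sketch of how those extensions might plug in, assuming PyTorch 2.x (block1/block2 are illustrative module names, checkpointing is shown here as activation checkpointing, and the DDP lines presuppose a torchrun launch):

import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Illustrative two-block model so that one block can be checkpointed.
block1 = nn.Sequential(nn.Linear(1024, 4096), nn.ReLU()).to(device)
block2 = nn.Linear(4096, 1024).to(device)

# torch.compile (PyTorch 2.x): captures and optimizes the module's graph.
block2 = torch.compile(block2)

x = torch.randn(64, 1024, device=device, requires_grad=True)

# Activation checkpointing: block1's activations are recomputed during the
# backward pass instead of being stored, lowering peak memory.
h = checkpoint(block1, x, use_reentrant=False)
out = block2(h)
out.sum().backward()

# DDP sketch (assumes a launch such as `torchrun --nproc_per_node=N train.py`):
# import os, torch.distributed as dist
# from torch.nn.parallel import DistributedDataParallel as DDP
# dist.init_process_group("nccl")
# torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
# model = DDP(model, device_ids=[int(os.environ["LOCAL_RANK"])])

torch.compile pays off most when tensor shapes are stable across steps, while checkpointing trades extra compute in the backward pass for lower peak activation memory.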