Debian调试PyTorch程序的实用流程
一 快速定位常见错误
import pdb; pdb.set_trace()
python -m pdb train.py
import ipdb; ipdb.set_trace()
import faulthandler
faulthandler.enable()
二 数据与训练过程可视化
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('runs/exp1')
writer.add_scalar('Loss/train', loss.item(), global_step=epoch)
writer.close()
# 终端启动:
tensorboard --logdir=runs
with torch.profiler.profile(
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3),
    on_trace_ready=lambda prof: prof.export_chrome_trace("trace.json"),
    record_shapes=True
) as prof:
    for step, (x, y) in enumerate(train_loader):
        ...
        loss.backward()
        optimizer.step()
        prof.step()
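# 之后用 Chrome 的 chrome://tracing(或 Perfetto)加载 trace.json 查看;若要在 TensorBoard 中查看,可将 on_trace_ready 改为 torch.profiler.tensorboard_trace_handler('runs/profiler')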
提示:训练循环中务必调用 prof.step(),否则时间线不完整。
三 深入调试与离线分析
pip install torchsnooper
import torchsnooper
@torchsnooper.snoop()
def forward(x):
    return x.view(-1, 10)
pip install viztracer
# 全局跟踪
viztracer my_script.py
# 或在代码块中跟踪
from viztracer import VizTracer
with VizTracer(log_torch=True) as tracer:
    train_one_epoch(...)
四 外部调试器与系统级工具
gdb -p $(pgrep -f python)
(gdb) bt
sudo perf record -g python train.py
sudo perf report
五 Debian环境配置与排错清单
python3 -m venv venv
source venv/bin/activate
pip install torch torchvision torchaudio
sudo apt update
sudo apt install gdb valgrind linux-perf python3-pip
pip install ipdb torchsnooper viztracer