在 Debian 上使用 PyTorch 进行自然语言处理的实操指南
一 环境准备与安装
二 GPU 支持与加速
三 快速上手示例
pip install torchtext
python - <<'PY'
# The heredoc delimiter is quoted with plain ASCII quotes so the shell passes
# the script to Python verbatim (no $-expansion or backtick substitution).
#
# Prerequisite: the "spacy" tokenizer backend needs spaCy and its English
# model installed in addition to torchtext:
#   pip install spacy && python -m spacy download en_core_web_sm
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import IMDB
from torchtext.vocab import build_vocab_from_iterator

# Callable that splits a raw string into a list of token strings using spaCy.
tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
def yield_tokens(data_iter):
    """Yield the tokenized text of every example in *data_iter*.

    Args:
        data_iter: Iterable of ``(label, text)`` pairs; the label is ignored.

    Yields:
        The result of ``tokenizer(text)`` for each pair, in input order.
    """
    for _, text in data_iter:
        yield tokenizer(text)
train_iter, test_iter = IMDB(split=("train", "test"))

# NOTE(review): the special-token names were stripped from the published text
# (everything from the "<" onward was lost). "<unk>" and "<pad>" are the
# conventional torchtext choices and are used consistently below -- confirm.
vocab = build_vocab_from_iterator(
    yield_tokens(train_iter), specials=["<unk>", "<pad>"]
)
# Look up out-of-vocabulary tokens as "<unk>" instead of raising an error.
vocab.set_default_index(vocab["<unk>"])


def text_pipeline(x):
    """Convert a raw text string into a list of vocabulary indices."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Convert a dataset label into a zero-based class index.

    NOTE(review): assumes labels are the integers 1 and 2 (or their string
    form) -- verify against the installed torchtext version.
    """
    return int(x) - 1


def collate_batch(batch):
    """Turn a list of ``(label, text)`` examples into padded tensors.

    Args:
        batch: List of ``(label, text)`` pairs as produced by the dataset.

    Returns:
        A ``(texts, labels)`` tuple: ``texts`` is an int64 tensor of shape
        ``(batch, max_seq_len)`` padded with the "<pad>" index, and
        ``labels`` is an int64 tensor of shape ``(batch,)``.
    """
    texts, labels = [], []
    for label, text in batch:
        texts.append(torch.tensor(text_pipeline(text), dtype=torch.int64))
        labels.append(label_pipeline(label))
    # NOTE(review): everything after the padding value was lost in the
    # original text; batch-first layout is chosen here and matched by the
    # LSTM in TextClassifier below.
    texts = pad_sequence(texts, padding_value=vocab["<pad>"], batch_first=True)
    labels = torch.tensor(labels, dtype=torch.int64)
    return texts, labels


class TextClassifier(nn.Module):
    """Embedding -> LSTM -> linear classifier over token-index sequences.

    NOTE(review): only the embedding layer survived in the published text;
    the LSTM and linear layers and ``forward`` are reconstructed from how
    the model is constructed and called by the training script -- confirm.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_class):
        """Build the layers.

        Args:
            vocab_size: Number of entries in the vocabulary.
            embed_dim: Size of each token embedding.
            hidden_dim: Size of the LSTM hidden state.
            num_class: Number of output classes.
        """
        super().__init__()
        # padding_idx keeps the "<pad>" embedding at zero and excludes it
        # from gradient updates.
        self.embedding = nn.Embedding(
            vocab_size, embed_dim, padding_idx=vocab["<pad>"]
        )
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_class)

    def forward(self, text):
        """Return class logits of shape ``(batch, num_class)``.

        Args:
            text: int64 tensor of token indices, shape ``(batch, seq_len)``.
        """
        embedded = self.embedding(text)
        # Only the final hidden state of the (single) LSTM layer is needed.
        _, (hidden, _) = self.lstm(embedded)
        return self.fc(hidden[-1])
BATCH_SIZE, EMBED_DIM, HIDDEN_DIM, NUM_CLASS, EPOCHS = 64, 128, 256, 2, 2 train_loader = DataLoader(list(train_iter), batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch) test_loader = DataLoader(list(test_iter), batch_size=BATCH_SIZE, collate_fn=collate_batch) device = torch.device(“cuda” if torch.cuda.is_available() else “cpu”) model = TextClassifier(len(vocab), EMBED_DIM, HIDDEN_DIM, NUM_CLASS).to(device) criterion = nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) model.train() for epoch in range(EPOCHS): total_loss = 0 for texts, labels in train_loader: texts, labels = texts.to(device), labels.to(device) optimizer.zero_grad() logits = model(texts) loss = criterion(logits, labels) loss.backward() optimizer.step() total_loss += loss.item() print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}“) model.eval() correct, total = 0, 0 with torch.no_grad(): for texts, labels in test_loader: texts, labels = texts.to(device), labels.to(device) preds = model(texts).argmax(dim=1) correct += (preds == labels).sum().item() total += labels.size(0) print(f"Accuracy: {correct/total:.4f}”) PY 以上两个示例分别展示了在 Debian 上使用 Transformers 进行句向量提取,以及用 torchtext + LSTM 完成文本分类的完整流程。
四 常见问题与优化建议