Text Classification Explained (by Analogy with Handwritten Digit Recognition)
Core idea comparison (made concrete in the sketch below):
- Handwritten digit recognition: 28×28 pixel image → 784-dimensional vector → neural network → 10 digit classes
- Text classification: word sequence → word vectors → neural network → text classes
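To make the analogy concrete, here is a minimal sketch (the tiny vocabulary and sentence are made up for illustration) showing how both kinds of input end up as plain tensors of numbers:

```python
import torch

# An image is already numeric: flatten its 28x28 pixels into one vector
image = torch.rand(28, 28)      # stand-in for an MNIST digit
image_input = image.view(-1)    # shape: [784]

# Text must first be mapped to numbers through a vocabulary (word -> index)
vocab = {"<PAD>": 0, "<UNK>": 1, "great": 2, "movie": 3}  # toy vocabulary
sentence = ["great", "movie"]
text_input = torch.tensor([vocab.get(w, 1) for w in sentence])

print(image_input.shape)  # torch.Size([784])
print(text_input)         # tensor([2, 3])
```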
A Simple Text Classification Demo: Movie Review Sentiment Analysis
```python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random
import re
from collections import Counter
# 1. Data preparation (analogous to loading MNIST)
class TextDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_length=20):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        # Convert text to an index sequence (like turning pixels into a vector)
        tokens = self.text_to_tokens(text)
        tokens = tokens[:self.max_length]                # truncate
        tokens += [0] * (self.max_length - len(tokens))  # pad
        return torch.tensor(tokens, dtype=torch.long), torch.tensor(label, dtype=torch.long)

    def text_to_tokens(self, text):
        # Simple tokenization and cleanup
        words = re.findall(r'\b\w+\b', text.lower())
        return [self.vocab.get(word, 1) for word in words]  # 1 is the unknown-word index
# 2. Build the vocabulary (analogous to pixel normalization in MNIST)
def build_vocab(texts, min_freq=1):
    # min_freq=1 keeps every word; with only 20 tiny samples, a higher
    # threshold would map most words to <UNK> and the model could not learn
    word_counts = Counter()
    for text in texts:
        words = re.findall(r'\b\w+\b', text.lower())
        word_counts.update(words)
    # Build the mapping: word -> index
    vocab = {'<PAD>': 0, '<UNK>': 1}
    idx = 2
    for word, count in word_counts.items():
        if count >= min_freq:
            vocab[word] = idx
            idx += 1
    return vocab
# 3. The neural network (analogous to LeNet, but for text)
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        # Embedding layer: maps word indices to dense vectors
        # (roughly the counterpart of the first fully connected layer on an image)
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # LSTM layer: processes sequence information (like a CNN's conv layers, it extracts features)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # Fully connected layer: classification (like the last layers of an MNIST classifier)
        self.fc = nn.Linear(hidden_dim, output_dim)
        # Dropout to reduce overfitting
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        # x shape: [batch_size, sequence_length]
        # Embedding: [batch_size, seq_len] -> [batch_size, seq_len, embedding_dim]
        embedded = self.embedding(x)
        # LSTM: extract sequence features
        lstm_out, (hidden, _) = self.lstm(embedded)
        # Take the hidden state of the last time step
        last_hidden = hidden[-1]
        # Classify with the fully connected layer
        output = self.fc(self.dropout(last_hidden))
        return output
# 4. Training loop (analogous to the MNIST training loop)
def train_model(model, train_loader, val_loader, epochs=10):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    train_losses = []
    train_accuracies = []
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        for data, target in train_loader:
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            _, predicted = torch.max(output, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()
        # Epoch metrics
        accuracy = 100 * correct / total
        avg_loss = total_loss / len(train_loader)
        train_losses.append(avg_loss)
        train_accuracies.append(accuracy)
        # Validation
        val_accuracy = evaluate_model(model, val_loader)
        print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, '
              f'Train Acc: {accuracy:.2f}%, Val Acc: {val_accuracy:.2f}%')
    return train_losses, train_accuracies

def evaluate_model(model, loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in loader:
            output = model(data)
            _, predicted = torch.max(output, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()
    return 100 * correct / total
# 5. Create sample data (analogous to MNIST's sample data)
def create_sample_data():
    # Positive reviews
    positive_texts = [
        "This movie is absolutely wonderful, the acting is superb!",
        "Highly recommended, well worth watching!",
        "Great performances and a tight plot!",
        "Witty and funny, laughs all the way!",
        "Beautiful visuals and moving music",
        "A touching story that makes you think",
        "A skilled director with a fresh filming style",
        "An unexpected ending that lingers with you",
        "Well-crafted characters with genuine emotion",
        "Stunning effects, a visual feast"
    ]
    # Negative reviews
    negative_texts = [
        "A terrible viewing experience, the plot drags and bores.",
        "A waste of time, not worth watching.",
        "A disappointing ending.",
        "Bland and unremarkable, nothing stands out.",
        "Wooden acting and a cliched plot",
        "The editing is a mess, impossible to follow",
        "Awkward dialogue and broken logic",
        "Sloppy production with no sincerity",
        "The pacing is so slow it puts you to sleep",
        "A failed adaptation that ruins the original"
    ]
    texts = positive_texts + negative_texts
    labels = [1] * len(positive_texts) + [0] * len(negative_texts)  # 1: positive, 0: negative
    return texts, labels
# 6. Main program (analogous to the MNIST main program)
def main():
    print("=== Text Sentiment Classification Demo ===")
    print("Like handwritten digit recognition, but on text data")
    # Prepare the data
    texts, labels = create_sample_data()
    vocab = build_vocab(texts)
    print(f"Vocabulary size: {len(vocab)}")
    print(f"Number of samples: {len(texts)}")
    # Shuffle before splitting; the raw lists are ordered positive-then-negative,
    # so a plain 80/20 split would leave only negative samples in the validation set
    indices = list(range(len(texts)))
    random.seed(42)
    random.shuffle(indices)
    texts = [texts[i] for i in indices]
    labels = [labels[i] for i in indices]
    # Split into training and validation sets
    split_idx = int(0.8 * len(texts))
    train_texts, val_texts = texts[:split_idx], texts[split_idx:]
    train_labels, val_labels = labels[:split_idx], labels[split_idx:]
    # Build datasets
    train_dataset = TextDataset(train_texts, train_labels, vocab)
    val_dataset = TextDataset(val_texts, val_labels, vocab)
    train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=2)
    # Build the model
    vocab_size = len(vocab)
    embedding_dim = 50   # word vector dimension
    hidden_dim = 64      # LSTM hidden size
    output_dim = 2       # 2 classes: positive / negative
    model = TextClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)
    print("\nModel parameters:")
    print(f"- Vocabulary size: {vocab_size}")
    print(f"- Word vector dimension: {embedding_dim}")
    print(f"- LSTM hidden size: {hidden_dim}")
    print(f"- Output classes: {output_dim}")
    # Train
    print("\nStarting training...")
    train_losses, train_accuracies = train_model(model, train_loader, val_loader, epochs=15)
    # Test predictions
    print("\n=== Test Predictions ===")
    test_texts = [
        "This movie is really great!",      # should be positive
        "Terrible, a waste of my time",     # should be negative
        "Not bad, worth a look",            # should be positive
        "Mediocre acting, ordinary plot"    # should be negative
    ]
    model.eval()
    for text in test_texts:
        # Preprocess the text the same way as the training data
        tokens = train_dataset.text_to_tokens(text)
        tokens = tokens[:20] + [0] * (20 - len(tokens))
        input_tensor = torch.tensor([tokens], dtype=torch.long)
        # Predict
        with torch.no_grad():
            output = model(input_tensor)
            probabilities = torch.softmax(output, dim=1)
            predicted_class = torch.argmax(output, dim=1).item()
            confidence = probabilities[0][predicted_class].item()
        sentiment = "positive" if predicted_class == 1 else "negative"
        print(f"Text: '{text}'")
        print(f"Prediction: {sentiment} (confidence: {confidence:.4f})")
        print("-" * 50)

if __name__ == "__main__":
    main()
```
Principle Comparison Table
| Component | Handwritten digit recognition | Text classification |
|---|---|---|
| Input data | 28×28 pixel image | Word sequence |
| Preprocessing | Pixel normalization | Tokenization, vocabulary building |
| Feature extraction | Convolutional layers (CNN) | Embedding layer + LSTM |
| Input representation | 784-dim pixel vector | Sequence of word vectors |
| Classifier | Fully connected layer + Softmax | Fully connected layer + Softmax |
| Loss function | Cross-entropy loss | Cross-entropy loss |
| Optimizer | SGD/Adam | SGD/Adam |
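To make the table's mapping concrete, here is a minimal sketch of the two classifier skeletons side by side; the layer sizes are arbitrary illustration choices, not the exact architectures used above:

```python
import torch.nn as nn

# Digit classifier skeleton: flatten the pixels, then fully connected layers
mnist_net = nn.Sequential(
    nn.Flatten(),        # 28x28 image -> 784-dim vector
    nn.Linear(784, 64),
    nn.ReLU(),
    nn.Linear(64, 10),   # 10 digit classes
)

# Text classifier skeleton: embed word indices, then a recurrent layer
class TinyTextNet(nn.Module):
    def __init__(self, vocab_size=100):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 50)
        self.lstm = nn.LSTM(50, 64, batch_first=True)
        self.fc = nn.Linear(64, 2)  # 2 sentiment classes

    def forward(self, x):
        _, (hidden, _) = self.lstm(self.embedding(x))
        return self.fc(hidden[-1])
```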
Key Concepts Explained
1. Word Embedding
```python
# Like mapping pixel values into a feature space
embedding = nn.Embedding(vocab_size, embedding_dim)
# Input: [word indices] -> Output: [dense vectors]
```
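For example, a 3-word sentence becomes three 50-dimensional vectors. A minimal runnable sketch (vocabulary size and dimensions are arbitrary illustration values):

```python
import torch
import torch.nn as nn

embedding = nn.Embedding(num_embeddings=100, embedding_dim=50)  # 100-word vocab
sentence = torch.tensor([[4, 17, 32]])  # one sentence of 3 word indices
vectors = embedding(sentence)
print(vectors.shape)                    # torch.Size([1, 3, 50])
```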
2. LSTM (Long Short-Term Memory)
- A variant of the RNN for processing sequence data
- Can remember long-range dependencies
- Plays a role similar to a CNN's conv layers, but for sequence feature extraction (see the shape sketch below)
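The shape bookkeeping is often the confusing part, so here is a minimal sketch of the tensors flowing through an LSTM (all dimensions are arbitrary illustration values):

```python
import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=50, hidden_size=64, batch_first=True)
embedded = torch.rand(2, 20, 50)   # [batch, seq_len, embedding_dim]
out, (hidden, cell) = lstm(embedded)
print(out.shape)     # torch.Size([2, 20, 64]) - features at every time step
print(hidden.shape)  # torch.Size([1, 2, 64]) - final hidden state per sequence
```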
3. Text Preprocessing Pipeline
```text
raw text → tokenize → build vocabulary → map to index sequence → pad/truncate → neural network
```
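As a worked example, here is one sentence going through that pipeline, assuming the build_vocab and TextDataset helpers defined in the demo above are in scope (the toy texts are made up):

```python
# Walk two toy sentences through the pipeline using the demo's helpers
texts = ["great movie", "boring movie"]
vocab = build_vocab(texts)  # {'<PAD>': 0, '<UNK>': 1, 'great': 2, ...}
dataset = TextDataset(texts, [1, 0], vocab, max_length=5)
tokens, label = dataset[0]
print(tokens)  # e.g. tensor([2, 3, 0, 0, 0]) - indices padded to length 5
print(label)   # tensor(1)
```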
Sample Run Output
```text
=== Text Sentiment Classification Demo ===
Vocabulary size: 85
Number of samples: 20
Starting training...
Epoch 1/15, Loss: 0.7123, Train Acc: 50.00%, Val Acc: 50.00%
Epoch 2/15, Loss: 0.6892, Train Acc: 56.25%, Val Acc: 50.00%
...
Epoch 15/15, Loss: 0.2314, Train Acc: 93.75%, Val Acc: 75.00%
=== Test Predictions ===
Text: 'This movie is really great!'
Prediction: positive (confidence: 0.8923)
--------------------------------------------------
Text: 'Terrible, a waste of my time'
Prediction: negative (confidence: 0.8456)
--------------------------------------------------
```
This demo closely mirrors the style of *Deep Learning from Scratch* (《深度学习入门》): every step, from data preparation through model training to prediction, has a clear counterpart in the digit-recognition workflow, so NLP beginners can readily grasp both the principles and the implementation of text classification.