Advanced Deep Learning: Principles of Natural Language Processing


How Text Classification Works (an Analogy with Handwritten Digit Recognition)

Core idea, side by side (see the shape sketch below):

  • Handwritten digit recognition: 28×28 pixel image → 784-dimensional vector → neural network → 10 digit classes
  • Text classification: word sequence → word vectors → neural network → text categories
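
To make the parallel concrete, here is a minimal sketch of how the tensors flow in each case. It assumes PyTorch; the batch size, vocabulary size, and dimensions are made-up illustration values.

python

import torch
import torch.nn as nn

# MNIST-style pipeline: pixels -> flat vector -> linear layer -> 10 digit scores
images = torch.rand(32, 1, 28, 28)            # a batch of 32 grayscale images
pixels = images.view(32, -1)                  # [32, 784] flattened pixel vectors
digit_logits = nn.Linear(784, 10)(pixels)     # [32, 10] one score per digit

# Text pipeline: word indices -> word vectors -> sequence model -> 2 sentiment scores
word_ids = torch.randint(0, 1000, (32, 20))          # 32 sentences, 20 word indices each
word_vecs = nn.Embedding(1000, 50)(word_ids)         # [32, 20, 50] word vectors
_, (h, _) = nn.LSTM(50, 64, batch_first=True)(word_vecs)
text_logits = nn.Linear(64, 2)(h[-1])                # [32, 2] positive/negative scores

print(digit_logits.shape, text_logits.shape)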

A Simple Text Classification Demo: Movie Review Sentiment Analysis

python

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import Counter
import re

# 1. Data preparation (analogous to loading the MNIST dataset)
class TextDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_length=20):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        # Convert the text to a sequence of word indices (like turning image pixels into a vector)
        tokens = self.text_to_tokens(text)
        tokens = tokens[:self.max_length]  # truncate
        tokens += [0] * (self.max_length - len(tokens))  # pad with <PAD> (index 0)
        
        return torch.tensor(tokens, dtype=torch.long), torch.tensor(label, dtype=torch.long)
    
    def text_to_tokens(self, text):
        # Simple tokenization and cleanup
        words = re.findall(r'\b\w+\b', text.lower())
        return [self.vocab.get(word, 1) for word in words]  # 1 is the <UNK> (unknown word) index

# 2. Build the vocabulary (the text counterpart of MNIST pixel normalization)
def build_vocab(texts, min_freq=2):
    word_counts = Counter()
    for text in texts:
        words = re.findall(r'\b\w+\b', text.lower())
        word_counts.update(words)
    
    # Build the vocabulary: word -> index
    vocab = {'<PAD>': 0, '<UNK>': 1}
    idx = 2
    for word, count in word_counts.items():
        if count >= min_freq:
            vocab[word] = idx
            idx += 1
    
    return vocab

# 3. The neural network model (similar in spirit to LeNet, but for text)
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(TextClassifier, self).__init__()
        
        # Embedding layer: maps word indices to dense vectors (roughly the role of the first fully connected layer for images)
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        
        # LSTM layer: processes the sequence (analogous to a CNN's convolutional layers extracting features)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        
        # Fully connected layer: classification (like the last layers of the MNIST classifier)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
        # Dropout to reduce overfitting
        self.dropout = nn.Dropout(0.3)
    
    def forward(self, x):
        # x shape: [batch_size, sequence_length]

        # Embedding layer: [batch_size, seq_len] -> [batch_size, seq_len, embedding_dim]
        embedded = self.embedding(x)
        
        # LSTM layer: extract sequence features
        lstm_out, (hidden, _) = self.lstm(embedded)
        
        # Take the final hidden state of the last LSTM layer (i.e. the last time step)
        last_hidden = hidden[-1]
        
        # Fully connected layer for classification
        output = self.fc(self.dropout(last_hidden))
        
        return output

# 4. Training function (same shape as the MNIST training loop)
def train_model(model, train_loader, val_loader, epochs=10):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    train_losses = []
    train_accuracies = []
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        
        for batch_idx, (data, target) in enumerate(train_loader):
            optimizer.zero_grad()
            
            output = model(data)
            loss = criterion(output, target)
            
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            _, predicted = torch.max(output.data, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()
        
        # Compute accuracy
        accuracy = 100 * correct / total
        avg_loss = total_loss / len(train_loader)
        
        train_losses.append(avg_loss)
        train_accuracies.append(accuracy)
        
        # Validation
        val_accuracy = evaluate_model(model, val_loader)
        
        print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, '
              f'Train Acc: {accuracy:.2f}%, Val Acc: {val_accuracy:.2f}%')
    
    return train_losses, train_accuracies

def evaluate_model(model, loader):
    model.eval()
    correct = 0
    total = 0
    
    with torch.no_grad():
        for data, target in loader:
            output = model(data)
            _, predicted = torch.max(output.data, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()
    
    return 100 * correct / total

# 5. Create sample data (playing the role of MNIST's example data)
def create_sample_data():
    # Positive reviews
    positive_texts = [
        "This movie was absolutely wonderful, the acting was outstanding!",
        "Highly recommended, well worth watching!",
        "Great performances and a tight, gripping plot!",
        "Witty and funny, I laughed the whole way through!",
        "Beautiful cinematography and a moving score",
        "A touching story that really makes you think",
        "A skilled director with a fresh visual style",
        "The ending was a surprise and stayed with me",
        "Well-drawn characters and genuine emotion",
        "Stunning special effects, a real visual feast"
    ]
    
    # Negative reviews
    negative_texts = [
        "A terrible experience, the plot dragged and bored me.",
        "A waste of time, I would not recommend it.",
        "A disappointing ending.",
        "Flat and unremarkable, nothing stood out.",
        "Wooden acting and a clichéd plot",
        "The editing was a mess, I couldn't follow it",
        "Awkward dialogue and no logic",
        "Sloppy production with no sincerity",
        "The pacing was so slow I nearly fell asleep",
        "A failed adaptation that ruins the original"
    ]
    
    texts = positive_texts + negative_texts
    labels = [1] * len(positive_texts) + [0] * len(negative_texts)  # 1: positive, 0: negative
    
    return texts, labels

# 6. Main program (same structure as the MNIST main program)
def main():
    print("=== Text Sentiment Classification Demo ===")
    print("Like handwritten digit recognition, but for text data")
    
    # Prepare the data
    texts, labels = create_sample_data()
    # With such a tiny corpus, keep every word (min_freq=1) so most words are not mapped to <UNK>
    vocab = build_vocab(texts, min_freq=1)
    
    print(f"词汇表大小: {len(vocab)}")
    print(f"样本数量: {len(texts)}")
    
    # Shuffle before splitting so both classes appear in the training and validation sets
    perm = torch.randperm(len(texts)).tolist()
    texts = [texts[i] for i in perm]
    labels = [labels[i] for i in perm]
    
    # Split into training and validation sets
    split_idx = int(0.8 * len(texts))
    train_texts, val_texts = texts[:split_idx], texts[split_idx:]
    train_labels, val_labels = labels[:split_idx], labels[split_idx:]
    
    # Create the datasets
    train_dataset = TextDataset(train_texts, train_labels, vocab)
    val_dataset = TextDataset(val_texts, val_labels, vocab)
    
    train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=2)
    
    # Create the model
    vocab_size = len(vocab)
    embedding_dim = 50  # word vector dimension
    hidden_dim = 64     # LSTM hidden state dimension
    output_dim = 2      # 2 classes: positive / negative
    
    model = TextClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)
    
    print(f"\n模型参数:")
    print(f"- 词汇表大小: {vocab_size}")
    print(f"- 词向量维度: {embedding_dim}")
    print(f"- LSTM隐藏层: {hidden_dim}")
    print(f"- 输出类别: {output_dim}")
    
    # Train the model
    print("\nStarting training...")
    train_losses, train_accuracies = train_model(model, train_loader, val_loader, epochs=15)
    
    # Test-time predictions
    print("\n=== Test Predictions ===")
    test_texts = [
        "This movie is really great!",              # should be positive
        "Terrible, a complete waste of my time",    # should be negative
        "Not bad at all, worth a watch",            # should be positive
        "Mediocre acting and an ordinary plot"      # should be negative
    ]
    
    model.eval()
    for text in test_texts:
        # Preprocess the text the same way the training data was processed
        tokens = train_dataset.text_to_tokens(text)
        tokens = tokens[:20]                    # truncate to max_length
        tokens += [0] * (20 - len(tokens))      # pad with <PAD> (index 0)
        input_tensor = torch.tensor([tokens], dtype=torch.long)
        
        # Predict
        with torch.no_grad():
            output = model(input_tensor)
            probabilities = torch.softmax(output, dim=1)
            predicted_class = torch.argmax(output, dim=1).item()
            confidence = probabilities[0][predicted_class].item()
        
        sentiment = "positive" if predicted_class == 1 else "negative"
        print(f"Text: '{text}'")
        print(f"Prediction: {sentiment} (confidence: {confidence:.4f})")
        print("-" * 50)

if __name__ == "__main__":
    main()

Pipeline Comparison Table

Component            | Handwritten digit recognition    | Text classification
Input data           | 28×28 pixel image                | Word sequence
Preprocessing        | Pixel normalization              | Tokenization, vocabulary building
Feature extraction   | Convolutional layers (CNN)       | Embedding layer + LSTM
Input representation | 784-dimensional pixel vector     | Sequence of word vectors
Classifier           | Fully connected layers + Softmax | Fully connected layer + Softmax
Loss function        | Cross-entropy loss               | Cross-entropy loss
Optimizer            | SGD/Adam                         | SGD/Adam
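
Note one detail hidden in the "Classifier" and "Loss function" rows: in PyTorch, nn.CrossEntropyLoss applies log-softmax internally, which is why the model above returns raw scores (logits) from self.fc and softmax only appears at prediction time. A minimal sketch (the numbers are made up):

python

import torch
import torch.nn as nn

logits = torch.tensor([[2.0, -1.0]])   # raw scores from the final linear layer
target = torch.tensor([0])             # the true class index

# CrossEntropyLoss = log-softmax + negative log-likelihood, so no softmax layer in the model
loss = nn.CrossEntropyLoss()(logits, target)

# Softmax is only needed when you want readable probabilities at inference time
probs = torch.softmax(logits, dim=1)
print(loss.item(), probs)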

Key Concepts Explained

1. Word Embeddings

python

# Similar in spirit to mapping pixel values into a feature space
embedding = nn.Embedding(vocab_size, embedding_dim)
# Input: [word indices] -> Output: [dense vectors]
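
For intuition, here is a tiny runnable example (the vocabulary size and dimensions are arbitrary illustration values) showing that the embedding layer is just a learnable lookup table:

python

import torch
import torch.nn as nn

embedding = nn.Embedding(num_embeddings=10, embedding_dim=4, padding_idx=0)

word_ids = torch.tensor([[3, 7, 0, 0]])   # one padded sentence of word indices
vectors = embedding(word_ids)             # [1, 4, 4]: one 4-dim vector per index
print(vectors.shape)

# Each row of embedding.weight is one word's vector; row 0 (<PAD>) is all zeros
print(torch.equal(vectors[0, 2], embedding.weight[0]))  # True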

2. LSTM (Long Short-Term Memory network)

  • A variant of the RNN designed for sequence data
  • Can remember long-range dependencies
  • Plays a role similar to a CNN's convolutions, but for extracting sequence features (see the sketch below)
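
A minimal sketch (the dimensions are just examples) of what the LSTM in the model above returns, and why the forward pass takes hidden[-1]:

python

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=50, hidden_size=64, batch_first=True)

x = torch.randn(2, 20, 50)              # a batch of 2 sequences, 20 steps, 50-dim word vectors
output, (hidden, cell) = lstm(x)

print(output.shape)   # [2, 20, 64]: the hidden state at every time step
print(hidden.shape)   # [1, 2, 64]: the final hidden state for each layer
# hidden[-1] is the last layer's final hidden state, a [2, 64] summary of each sequence
print(torch.allclose(hidden[-1], output[:, -1, :]))  # True for a single-layer, unidirectional LSTM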

3. The Text Preprocessing Pipeline

text

Raw text → tokenize → build vocabulary → convert to index sequence → pad/truncate → neural network
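
Traced by hand for a single sentence (the small vocabulary below is made up purely for illustration), the pipeline looks like this:

python

import re

vocab = {'<PAD>': 0, '<UNK>': 1, 'this': 2, 'movie': 3, 'is': 4, 'great': 5}
max_length = 8

text = "This movie is absolutely great"
words = re.findall(r'\b\w+\b', text.lower())   # ['this', 'movie', 'is', 'absolutely', 'great']
ids = [vocab.get(w, 1) for w in words]         # [2, 3, 4, 1, 5]  ('absolutely' -> <UNK>)
ids = ids[:max_length]                         # truncate to max_length
ids += [0] * (max_length - len(ids))           # pad with <PAD>
print(ids)                                     # [2, 3, 4, 1, 5, 0, 0, 0]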

Example Output (illustrative; exact numbers will vary from run to run)

text

=== Text Sentiment Classification Demo ===
Vocabulary size: 85
Number of samples: 20

Starting training...
Epoch 1/15, Loss: 0.7123, Train Acc: 50.00%, Val Acc: 50.00%
Epoch 2/15, Loss: 0.6892, Train Acc: 56.25%, Val Acc: 50.00%
...
Epoch 15/15, Loss: 0.2314, Train Acc: 93.75%, Val Acc: 75.00%

=== Test Predictions ===
Text: 'This movie is really great!'
Prediction: positive (confidence: 0.8923)
--------------------------------------------------
Text: 'Terrible, a complete waste of my time'
Prediction: negative (confidence: 0.8456)
--------------------------------------------------

This demo deliberately mirrors the style of 《深度学习入门》 (Deep Learning from Scratch): from data preparation through model training to prediction, every step has a clear counterpart in the MNIST example, so NLP beginners can easily see how text classification works and how it is implemented.