In previous chapters we studied value-based reinforcement learning methods such as Q-Learning and DQN, which learn an optimal policy by estimating the value of states or state-action pairs. In this chapter we explore another important family of reinforcement learning algorithms: policy gradient methods.
Overview of Policy Gradient Methods
Policy gradient methods directly parameterize the policy π(a|s;θ) and optimize the parameters θ by gradient ascent to maximize expected return. Compared with value-based methods they offer several advantages, summarized at the end of this chapter. The diagram below shows where policy gradient methods sit among the families of reinforcement learning algorithms:
graph TD
A[Reinforcement Learning Methods] --> B[Value-Based Methods]
A --> C[Policy-Based Methods]
A --> D[Actor-Critic Methods]
B --> B1[Q-Learning]
B --> B2[DQN]
B --> B3[Double DQN]
C --> C1[REINFORCE]
C --> C2[REINFORCE with Baseline]
D --> D1[Actor-Critic]
D --> D2[A3C]
D --> D3[PPO]
D --> D4[SAC]
style A fill:#f4a261,stroke:#333
style B fill:#2a9d8f,stroke:#333
style C fill:#e76f51,stroke:#333
style D fill:#264653,stroke:#333
The Policy Gradient Theorem
The policy gradient theorem is the theoretical foundation of policy gradient methods. It states that the gradient of the policy's performance objective can be written as:

$$\nabla_\theta J(\theta) = \mathbb{E}_{\tau \sim \pi_\theta}\left[\sum_{t} \nabla_\theta \log \pi_\theta(a_t \mid s_t)\, Q^{\pi_\theta}(s_t, a_t)\right]$$

where τ denotes a trajectory, π_θ the parameterized policy, and Q^{π_θ} the action-value function under π_θ.
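For readers who want to see where this identity comes from, here is a compact sketch of the standard derivation using the log-derivative trick ∇_θ π_θ = π_θ ∇_θ log π_θ (general background, not specific to this chapter's code):

$$\nabla_\theta \mathbb{E}_{\tau \sim \pi_\theta}[R(\tau)] = \nabla_\theta \int \pi_\theta(\tau)\, R(\tau)\, d\tau = \int \pi_\theta(\tau)\, \nabla_\theta \log \pi_\theta(\tau)\, R(\tau)\, d\tau = \mathbb{E}_{\tau \sim \pi_\theta}\bigl[\nabla_\theta \log \pi_\theta(\tau)\, R(\tau)\bigr]$$

Expanding log π_θ(τ) into per-step terms (the environment dynamics do not depend on θ, so they drop out) yields the per-time-step sum stated above.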
The REINFORCE Algorithm
REINFORCE is the most basic policy gradient algorithm; it estimates returns by Monte Carlo sampling.
How the Algorithm Works
The REINFORCE algorithm proceeds as follows:
- Sample a complete trajectory from the current policy
- Compute the return G_t = r_t + γ·r_{t+1} + γ²·r_{t+2} + … for every time step t
- Update the policy parameters using the policy gradient theorem
Code Implementation
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import gym
from collections import deque
class PolicyNetwork(nn.Module):
    """Policy network: maps a state to a probability distribution over actions."""
    def __init__(self, state_size, action_size, hidden_size=64):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.softmax(x, dim=-1)  # output a probability distribution over actions
class REINFORCEAgent:
    """REINFORCE agent."""
    def __init__(self, state_size, action_size, lr=0.01, gamma=0.99):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        # Policy network and its optimizer
        self.policy_network = PolicyNetwork(state_size, action_size)
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=lr)
        # Per-episode storage for log-probabilities and rewards
        self.saved_log_probs = []
        self.rewards = []
    def select_action(self, state):
        """Sample an action from the current policy."""
        state = torch.from_numpy(state).float().unsqueeze(0)
        probs = self.policy_network(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        # Save the log-probability for the policy gradient update
        self.saved_log_probs.append(action_dist.log_prob(action))
        return action.item()
    def update(self):
        """Update the policy from the completed episode."""
        # Compute discounted returns, iterating backwards over the rewards
        R = 0
        returns = []
        for r in reversed(self.rewards):
            R = r + self.gamma * R
            returns.insert(0, R)
        # Convert to a tensor
        returns = torch.tensor(returns)
        # Normalize returns to stabilize training
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)
        # Policy loss: -log pi(a|s) * G_t, summed over the episode
        policy_loss = []
        for log_prob, R in zip(self.saved_log_probs, returns):
            policy_loss.append(-log_prob * R)
        # Take an optimization step
        self.optimizer.zero_grad()
        policy_loss = torch.cat(policy_loss).sum()
        policy_loss.backward()
        self.optimizer.step()
        # Clear the episode buffers
        del self.rewards[:]
        del self.saved_log_probs[:]
def train_reinforce(env_name='CartPole-v1', episodes=1000):
    """Train a REINFORCE agent."""
    env = gym.make(env_name)
    agent = REINFORCEAgent(env.observation_space.shape[0], env.action_space.n)
    scores = deque(maxlen=100)
    avg_scores = []
    for episode in range(episodes):
        state = env.reset()
        if isinstance(state, tuple):  # gymnasium-style reset() returns (obs, info)
            state = state[0]
        episode_reward = 0
        done = False
        # Roll out one complete episode
        while not done:
            action = agent.select_action(state)
            result = env.step(action)
            if len(result) == 4:  # old gym API: (obs, reward, done, info)
                next_state, reward, done, _ = result
            else:  # gymnasium API: (obs, reward, terminated, truncated, info)
                next_state, reward, terminated, truncated, _ = result
                done = terminated or truncated
            agent.rewards.append(reward)
            state = next_state
            episode_reward += reward
        # Monte Carlo update at the end of the episode
        agent.update()
        scores.append(episode_reward)
        avg_scores.append(np.mean(scores))
        if episode % 100 == 0:
            print(f'Episode {episode}, Average Score: {np.mean(scores):.2f}')
    return agent, avg_scores

# Run training
# agent, scores = train_reinforce()
REINFORCE with Baseline
To reduce the variance of the policy gradient, we can introduce a baseline b(s):

$$\nabla_\theta J(\theta) = \mathbb{E}_{\tau \sim \pi_\theta}\left[\sum_{t} \nabla_\theta \log \pi_\theta(a_t \mid s_t)\,\bigl(G_t - b(s_t)\bigr)\right]$$

where b(s) is a baseline for state s, typically the state-value function V(s).
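Subtracting a baseline does not bias the gradient, because the baseline term has zero expectation. A one-line check (a standard identity, stated here for completeness):

$$\mathbb{E}_{a \sim \pi_\theta(\cdot \mid s)}\bigl[\nabla_\theta \log \pi_\theta(a \mid s)\, b(s)\bigr] = b(s)\, \nabla_\theta \sum_{a} \pi_\theta(a \mid s) = b(s)\, \nabla_\theta 1 = 0$$

What does change is the variance: choosing b(s) ≈ V(s) centers the learning signal, which is exactly what the critic head in the network below provides.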
class ActorCriticNetwork(nn.Module):
    """Actor-critic network with a shared feature torso."""
    def __init__(self, state_size, action_size, hidden_size=64):
        super(ActorCriticNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        # Actor head (policy)
        self.actor = nn.Linear(hidden_size, action_size)
        # Critic head (state value)
        self.critic = nn.Linear(hidden_size, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        actor_out = F.softmax(self.actor(x), dim=-1)
        critic_out = self.critic(x)
        return actor_out, critic_out
class REINFORCEBaselineAgent:
    """REINFORCE agent with a learned state-value baseline."""
    def __init__(self, state_size, action_size, lr=0.001, gamma=0.99):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.network = ActorCriticNetwork(state_size, action_size)
        self.optimizer = optim.Adam(self.network.parameters(), lr=lr)
        self.saved_log_probs = []
        self.values = []
        self.rewards = []

    def select_action(self, state):
        """Sample an action and record its log-probability and value estimate."""
        state = torch.from_numpy(state).float().unsqueeze(0)
        probs, value = self.network(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        self.saved_log_probs.append(action_dist.log_prob(action))
        self.values.append(value)
        return action.item()
    def update(self):
        """Update the policy and value heads from the completed episode."""
        # Compute discounted returns
        R = 0
        returns = []
        for r in reversed(self.rewards):
            R = r + self.gamma * R
            returns.insert(0, R)
        returns = torch.tensor(returns)
        # Normalize returns
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)
        # Advantages: detach the critic values here so the actor loss does not
        # backpropagate into the critic through the advantage term
        values = torch.cat(self.values).squeeze(-1)
        advantages = returns - values.detach()
        # Policy (actor) loss and value (critic) loss
        actor_loss = []
        critic_loss = []
        for log_prob, advantage, value, R in zip(
                self.saved_log_probs, advantages, values, returns):
            actor_loss.append(-log_prob * advantage)
            critic_loss.append(F.mse_loss(value, R))
        # Total loss: actor term plus critic term
        total_loss = torch.stack(actor_loss).sum() + torch.stack(critic_loss).sum()
        # Take an optimization step
        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()
        # Clear the episode buffers
        del self.rewards[:]
        del self.saved_log_probs[:]
        del self.values[:]
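The commented call below refers to train_reinforce_with_baseline, which the original text never defines. Here is a minimal sketch that mirrors train_reinforce above with REINFORCEBaselineAgent swapped in; the function name is taken from the commented call, and everything else follows the earlier loop:

def train_reinforce_with_baseline(env_name='CartPole-v1', episodes=1000):
    """Train a REINFORCE-with-baseline agent (minimal sketch mirroring train_reinforce)."""
    env = gym.make(env_name)
    agent = REINFORCEBaselineAgent(env.observation_space.shape[0], env.action_space.n)
    scores = deque(maxlen=100)
    avg_scores = []
    for episode in range(episodes):
        state = env.reset()
        if isinstance(state, tuple):  # gymnasium-style reset() returns (obs, info)
            state = state[0]
        episode_reward = 0
        done = False
        while not done:
            action = agent.select_action(state)
            result = env.step(action)
            if len(result) == 4:  # old gym API
                next_state, reward, done, _ = result
            else:  # gymnasium API
                next_state, reward, terminated, truncated, _ = result
                done = terminated or truncated
            agent.rewards.append(reward)
            state = next_state
            episode_reward += reward
        agent.update()
        scores.append(episode_reward)
        avg_scores.append(np.mean(scores))
        if episode % 100 == 0:
            print(f'Episode {episode}, Average Score: {np.mean(scores):.2f}')
    return agent, avg_scores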
# Train REINFORCE with a baseline
# agent, scores = train_reinforce_with_baseline()
Proximal Policy Optimization (PPO)
PPO is a more advanced policy gradient method that improves training stability by limiting how much the policy can change in a single update.
The PPO-Clip Algorithm
PPO-Clip constrains the ratio between the new and the old policy with a clipping mechanism:

$$L^{\mathrm{CLIP}}(\theta) = \mathbb{E}_t\Bigl[\min\bigl(r_t(\theta)\, A_t,\ \operatorname{clip}(r_t(\theta),\, 1-\epsilon,\, 1+\epsilon)\, A_t\bigr)\Bigr]$$

where r_t(θ) = π_θ(a_t|s_t) / π_θ_old(a_t|s_t) is the probability ratio and A_t is the advantage. Intuitively, once the ratio moves outside [1−ε, 1+ε] in the direction favored by the advantage, the objective stops rewarding further movement.
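To make the clipping concrete, here is a tiny standalone computation of the clipped surrogate on made-up ratios and advantages (toy numbers, not from a real rollout):

import torch

# Toy probability ratios r_t(theta) and (all-positive) advantages A_t
ratios = torch.tensor([0.5, 0.9, 1.0, 1.5])
advantages = torch.tensor([1.0, 1.0, 1.0, 1.0])
eps_clip = 0.2

surr1 = ratios * advantages
surr2 = torch.clamp(ratios, 1 - eps_clip, 1 + eps_clip) * advantages
# With positive advantages, the objective stops rewarding ratios above 1 + eps_clip
print(torch.min(surr1, surr2))  # tensor([0.5000, 0.9000, 1.0000, 1.2000])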
class PPOAgent:
    """PPO agent."""
    def __init__(self, state_size, action_size, lr=3e-4, gamma=0.99,
                 eps_clip=0.2, K_epochs=4, hidden_size=64):
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.network = ActorCriticNetwork(state_size, action_size, hidden_size)
        self.optimizer = optim.Adam(self.network.parameters(), lr=lr)
        self.buffer = []

    def select_action(self, state):
        """Sample an action; also return its log-probability and value estimate."""
        state = torch.FloatTensor(state).unsqueeze(0)
        probs, value = self.network(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action.item(), action_dist.log_prob(action), value

    def store_transition(self, transition):
        """Store one transition tuple in the rollout buffer."""
        self.buffer.append(transition)
    def update(self):
        """Update the network from the collected rollout."""
        if len(self.buffer) == 0:
            return
        # Unpack the buffer; detach the old log-probs and values so that the
        # K_epochs of updates below do not backpropagate through the graphs
        # built during data collection
        states = torch.FloatTensor(np.array([t[0] for t in self.buffer]))
        actions = torch.LongTensor([t[1] for t in self.buffer])
        old_log_probs = torch.stack([t[2] for t in self.buffer]).squeeze(-1).detach()
        rewards = [t[3] for t in self.buffer]
        dones = [t[4] for t in self.buffer]
        values = torch.cat([t[5] for t in self.buffer]).squeeze(-1).detach()
        # Compute discounted returns, resetting at episode boundaries
        returns = []
        discounted_reward = 0
        for reward, done in zip(reversed(rewards), reversed(dones)):
            if done:
                discounted_reward = 0
            discounted_reward = reward + self.gamma * discounted_reward
            returns.insert(0, discounted_reward)
        returns = torch.FloatTensor(returns)
        # Normalize returns
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)
        # Advantage estimates (no gradient: values were detached above)
        advantages = returns - values
        # Several epochs of updates on the same rollout
        for _ in range(self.K_epochs):
            # Recompute log-probabilities under the current policy
            probs, new_values = self.network(states)
            dist = torch.distributions.Categorical(probs)
            new_log_probs = dist.log_prob(actions)
            ratios = torch.exp(new_log_probs - old_log_probs)
            # Clipped surrogate objective
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
            actor_loss = -torch.min(surr1, surr2).mean()
            critic_loss = F.mse_loss(new_values.squeeze(-1), returns)
            # Total loss: clipped policy loss plus value regression
            loss = actor_loss + 0.5 * critic_loss
            # Take an optimization step
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        # Clear the rollout buffer
        self.buffer = []
def train_ppo(env_name='CartPole-v1', episodes=1000):
    """Train a PPO agent."""
    env = gym.make(env_name)
    agent = PPOAgent(env.observation_space.shape[0], env.action_space.n)
    scores = deque(maxlen=100)
    avg_scores = []
    for episode in range(episodes):
        state = env.reset()
        if isinstance(state, tuple):  # gymnasium-style reset() returns (obs, info)
            state = state[0]
        episode_reward = 0
        done = False
        while not done:
            action, log_prob, value = agent.select_action(state)
            result = env.step(action)
            if len(result) == 4:  # old gym API
                next_state, reward, done, _ = result
            else:  # gymnasium API
                next_state, reward, terminated, truncated, _ = result
                done = terminated or truncated
            # Store the transition
            agent.store_transition((state, action, log_prob, reward, done, value))
            state = next_state
            episode_reward += reward
        # Update the policy after each episode
        agent.update()
        scores.append(episode_reward)
        avg_scores.append(np.mean(scores))
        if episode % 100 == 0:
            print(f'Episode {episode}, Average Score: {np.mean(scores):.2f}')
    return agent, avg_scores

# Run PPO training
# ppo_agent, ppo_scores = train_ppo()
Algorithm Comparison and Analysis
Let's compare the performance of the different policy gradient algorithms. Note that the curves below are simulated for illustration, not produced by real training runs:
import matplotlib.pyplot as plt
def compare_algorithms():
    """Compare the performance of the different algorithms."""
    # Simulated training curves for illustration (not real training results)
    episodes = list(range(1000))
    # Simulated REINFORCE performance
    reinforce_scores = [min(50, 10 + 0.02 * i + np.random.normal(0, 2)) for i in episodes]
    reinforce_avg = [np.mean(reinforce_scores[max(0, i-100):i+1]) for i in episodes]
    # Simulated REINFORCE-with-Baseline performance
    baseline_scores = [min(80, 20 + 0.04 * i + np.random.normal(0, 2)) for i in episodes]
    baseline_avg = [np.mean(baseline_scores[max(0, i-100):i+1]) for i in episodes]
    # Simulated PPO performance
    ppo_scores = [min(200, 30 + 0.1 * i + np.random.normal(0, 3)) for i in episodes]
    ppo_avg = [np.mean(ppo_scores[max(0, i-100):i+1]) for i in episodes]
    # Plot the comparison
    plt.figure(figsize=(12, 6))
    plt.plot(episodes, reinforce_avg, label='REINFORCE', linewidth=2)
    plt.plot(episodes, baseline_avg, label='REINFORCE with Baseline', linewidth=2)
    plt.plot(episodes, ppo_avg, label='PPO', linewidth=2)
    plt.xlabel('Training Episodes')
    plt.ylabel('Average Score (last 100 episodes)')
    plt.title('Policy Gradient Algorithm Comparison')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
    # Print final performance
    print("Algorithm comparison (average score over the last 100 episodes):")
    print(f"REINFORCE: {reinforce_avg[-1]:.2f}")
    print(f"REINFORCE with Baseline: {baseline_avg[-1]:.2f}")
    print(f"PPO: {ppo_avg[-1]:.2f}")
# compare_algorithms()
Chapter Summary
In this chapter we took a close look at policy gradient methods, covering:
- The REINFORCE algorithm: the most basic policy gradient method, built on Monte Carlo sampling
- REINFORCE with Baseline: introducing a baseline reduces variance and stabilizes training
- PPO: a more advanced policy optimization method that bounds update size with a clipping mechanism
Compared with value-based methods, policy gradient methods offer the following advantages:
- They can learn stochastic policies, which matters in environments where a deterministic policy is inadequate
- They handle continuous action spaces more naturally (see the sketch after this list)
- They optimize the objective directly, avoiding error propagation from value-function estimation
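To illustrate the continuous-action point, here is a minimal sketch of a Gaussian policy head that could replace the softmax output for an environment such as Pendulum-v1. It is not part of this chapter's agents, and the state-independent log_std parameterization is just one common choice:

class GaussianPolicy(nn.Module):
    """Minimal Gaussian policy for continuous actions (illustrative sketch)."""
    def __init__(self, state_size, action_size, hidden_size=64):
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.mu = nn.Linear(hidden_size, action_size)           # action mean
        self.log_std = nn.Parameter(torch.zeros(action_size))   # state-independent log std

    def forward(self, x):
        h = F.relu(self.fc1(x))
        return self.mu(h), self.log_std.exp()

    def select_action(self, state):
        mu, std = self.forward(torch.from_numpy(state).float().unsqueeze(0))
        dist = torch.distributions.Normal(mu, std)
        action = dist.sample()
        # Sum log-probs over action dimensions for the policy gradient
        return action.squeeze(0).numpy(), dist.log_prob(action).sum(dim=-1)

The log-probabilities it returns plug into the same REINFORCE and PPO updates as in the categorical case.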
Suggestions for Further Study
- Soft Actor-Critic (SAC): explore the maximum-entropy reinforcement learning framework
- Trust Region Policy Optimization (TRPO): study the predecessor of PPO
- Multi-agent policy gradients: learn about policy optimization in multi-agent environments
- Imitation learning and inverse reinforcement learning: learn policies from expert demonstrations
Exercises
- Implement and compare the effect of different learning rates on the REINFORCE algorithm
- Apply PPO to a continuous action space environment (such as Pendulum-v1)
- Implement an asynchronous policy gradient algorithm (A3C)
- Investigate how different baseline functions affect algorithm performance
💡 Tip: Policy gradient methods are usually less stable to train than value-based methods and need careful hyperparameter tuning. Start with simple environments and increase complexity gradually.