🎯 学习目标:通过动手实践,深入理解强化学习的基本概念和Q-Learning算法。完成本练习后,你将能够独立实现简单的强化学习算法,并将其应用于解决实际问题。
实践一:实现一个简单的Q-Learning算法
在第一个练习中,我们将从零开始实现一个Q-Learning算法,解决经典的"走迷宫"问题。
问题描述
我们有一个5x5的网格世界,智能体从左下角(4,0)出发,目标是到达右上角(0,4)的宝藏位置。网格中可能有一些障碍物,智能体需要绕过这些障碍物找到最短路径。
S: 起点 (Start)
G: 目标 (Goal)
X: 障碍物 (Obstacle)
.: 可通行区域 (Free space)
地图布局:
. . . . G
. X . X .
. . . . .
. X . X .
S . . . .
动作空间
智能体可以执行4个动作:
- 0: 向上移动
- 1: 向右移动
- 2: 向下移动
- 3: 向左移动
如果动作会导致智能体移出网格或撞到障碍物,智能体位置不会改变。
奖励机制
- 到达目标位置:+10 (终止状态)
- 其他所有移动:-1 (鼓励寻找最短路径)
实现步骤
import numpy as np
import random
import matplotlib.pyplot as plt
class GridEnvironment:
    """A 5x5 grid world: start at (4, 0), goal at (0, 4).

    Cell codes in ``self.grid``: 0 = free, 1 = obstacle, 2 = goal.
    Every step yields reward -1; reaching the goal yields +10 and ends
    the episode. Invalid moves (off-grid or into an obstacle) leave the
    agent in place but still cost -1.
    """

    # Action index -> (row delta, col delta): 0=up, 1=right, 2=down, 3=left.
    _MOVES = ((-1, 0), (0, 1), (1, 0), (0, -1))

    def __init__(self):
        # Goal (code 2) sits at row 0, col 4; start is row 4, col 0.
        self.grid = np.array([
            [0, 0, 0, 0, 2],
            [0, 1, 0, 1, 0],
            [0, 0, 0, 0, 0],
            [0, 1, 0, 1, 0],
            [0, 0, 0, 0, 0],
        ])
        self.start_pos = (4, 0)
        self.agent_pos = self.start_pos
        self.action_space = 4  # up / right / down / left
        rows, cols = self.grid.shape
        self.state_size = rows * cols

    def reset(self):
        """Put the agent back at the start; return the start state index."""
        self.agent_pos = self.start_pos
        return self.pos_to_state(self.agent_pos)

    def pos_to_state(self, pos):
        """Map a (row, col) pair to a flat state index (row-major)."""
        row, col = pos
        return row * self.grid.shape[1] + col

    def state_to_pos(self, state):
        """Map a flat state index back to a (row, col) pair."""
        return divmod(state, self.grid.shape[1])

    def step(self, action):
        """Apply an action and return (next_state, reward, done)."""
        d_row, d_col = self._MOVES[action]
        row = self.agent_pos[0] + d_row
        col = self.agent_pos[1] + d_col
        inside = 0 <= row < self.grid.shape[0] and 0 <= col < self.grid.shape[1]
        # Only commit the move if it stays on the grid and avoids obstacles.
        if inside and self.grid[row, col] != 1:
            self.agent_pos = (row, col)
        done = bool(self.grid[self.agent_pos[0], self.agent_pos[1]] == 2)
        reward = 10 if done else -1
        return self.pos_to_state(self.agent_pos), reward, done

    def render(self):
        """Print the grid: '.' free, 'X' obstacle, 'G' goal, 'A' agent."""
        symbols = self.grid.copy().astype(object)
        symbols[symbols == 0] = '.'
        symbols[symbols == 1] = 'X'
        symbols[symbols == 2] = 'G'
        row, col = self.agent_pos
        # Do not overwrite the goal marker when the agent stands on it.
        if symbols[row, col] == '.':
            symbols[row, col] = 'A'
        print(symbols)
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Parameters
    ----------
    state_size / action_size : dimensions of the Q-table.
    lr : learning rate (alpha) for the TD update.
    gamma : discount factor for future rewards.
    epsilon / epsilon_decay / epsilon_min : exploration rate, its
        multiplicative decay, and its floor.
    """

    def __init__(self, state_size, action_size, lr=0.1, gamma=0.95, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01):
        self.state_size = state_size
        self.action_size = action_size
        self.lr = lr                        # alpha
        self.gamma = gamma                  # discount factor
        self.epsilon = epsilon              # current exploration probability
        self.epsilon_decay = epsilon_decay  # multiplied in after each update()
        self.epsilon_min = epsilon_min      # exploration floor
        # Q(s, a) estimates, all initialised to zero.
        self.q_table = np.zeros((state_size, action_size))

    def act(self, state):
        """Return a random action with probability epsilon, else the greedy one."""
        explore = np.random.rand() <= self.epsilon
        if explore:
            return random.randrange(self.action_size)
        return np.argmax(self.q_table[state])

    def update(self, state, action, reward, next_state, done):
        """One-step Q-learning update, then decay epsilon.

        Q(s,a) += lr * (r + gamma * max_a' Q(s',a') * (1 - done) - Q(s,a))
        """
        max_next_q = np.max(self.q_table[next_state])  # a value, not an action index
        target = reward + self.gamma * max_next_q * (not done)
        self.q_table[state, action] += self.lr * (target - self.q_table[state, action])
        # NOTE(review): epsilon decays once per *step* (every update call),
        # not per episode — with the default 0.995 it hits epsilon_min after
        # roughly 900 steps; confirm that is the intended schedule.
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
def train_agent(episodes=1000, max_steps=500):
    """Train a Q-learning agent on the 5x5 GridEnvironment.

    Parameters
    ----------
    episodes : number of training episodes to run.
    max_steps : hard cap on steps per episode (new, default 500); the
        original loop was unbounded and could spin forever if the policy
        ever got stuck cycling between states.

    Returns
    -------
    (agent, scores) : the trained QLearningAgent and the per-episode
        total-reward history.
    """
    env = GridEnvironment()
    agent = QLearningAgent(env.state_size, env.action_space)
    scores = []
    for episode in range(episodes):
        state = env.reset()
        total_reward = 0
        done = False
        steps = 0
        # Cap episode length so an unlucky random policy cannot loop forever.
        while not done and steps < max_steps:
            action = agent.act(state)
            next_state, reward, done = env.step(action)
            agent.update(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward
            steps += 1
        scores.append(total_reward)
        # Progress report every 100 episodes: rolling mean of the last 100.
        if (episode + 1) % 100 == 0:
            avg_score = np.mean(scores[-100:])
            print(f"Episode {episode+1}, Average Score: {avg_score:.2f}, Epsilon: {agent.epsilon:.3f}")
    return agent, scores
def test_agent(agent, episodes=5):
    """Roll out the purely greedy policy from agent.q_table.

    Runs a few episodes, rendering the grid after every move; each
    episode is capped at 20 steps.
    """
    env = GridEnvironment()
    for ep in range(episodes):
        state = env.reset()
        done = False
        print(f"\n--- 测试回合 {ep+1} ---")
        env.render()
        steps = 0
        while steps < 20:
            if done:
                break
            action = np.argmax(agent.q_table[state])  # always exploit
            state, reward, done = env.step(action)
            steps += 1
            print(f"动作: {['上', '右', '下', '左'][action]}, 奖励: {reward}")
            env.render()
        if done:
            print("✅ 成功到达目标!")
        else:
            print("❌ 未能在限定步数内到达目标")
# Run training, then test the greedy policy and plot the score curves.
if __name__ == "__main__":
    print("开始训练Q-Learning智能体...")
    trained_agent, scores = train_agent(episodes=1000)
    print("\n训练完成,开始测试...")
    test_agent(trained_agent, episodes=3)

    # Left panel: raw per-episode score. Right panel: mean of each
    # disjoint 100-episode group.
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(scores)
    plt.title('训练过程中的得分')
    plt.xlabel('回合数')
    plt.ylabel('得分')
    plt.subplot(1, 2, 2)
    # BUG FIX: the original used range(0, len(scores)-100) — a step-1
    # sliding window — which contradicts the axis label "每组100回合".
    # Step by 100 so each point really is one disjoint 100-episode group.
    avg_scores = [np.mean(scores[i:i+100]) for i in range(0, len(scores), 100)]
    plt.plot(avg_scores)
    plt.title('平均每100回合得分')
    plt.xlabel('回合组 (每组100回合)')
    plt.ylabel('平均得分')
    plt.tight_layout()
    plt.show()
实践二:分析Q-Learning的参数影响
在这个练习中,我们将探索不同参数对Q-Learning算法性能的影响。
学习率 (α) 的影响
学习率决定了我们更新Q值时新信息的重要程度。过高的学习率可能导致算法不稳定,而过低的学习率可能导致收敛缓慢。
def compare_learning_rates():
    """Compare training curves for several learning rates.

    Trains a fresh agent for 500 episodes on GridEnvironment at each
    learning rate in {0.01, 0.1, 0.5, 0.9}, then plots the mean score
    of each disjoint 50-episode group.
    """
    learning_rates = [0.01, 0.1, 0.5, 0.9]
    all_scores = []
    for lr in learning_rates:
        print(f"测试学习率: {lr}")
        env = GridEnvironment()
        agent = QLearningAgent(env.state_size, env.action_space, lr=lr)
        scores = []
        for episode in range(500):
            state = env.reset()
            total_reward = 0
            done = False
            while not done:
                action = agent.act(state)
                next_state, reward, done = env.step(action)
                agent.update(state, action, reward, next_state, done)
                state = next_state
                total_reward += reward
            scores.append(total_reward)
        all_scores.append(scores)
    # Plot one smoothed curve per learning rate.
    plt.figure(figsize=(10, 6))
    for i, lr in enumerate(learning_rates):
        # BUG FIX: the original stepped by 1 (a sliding window), which
        # contradicts the axis label "每组50回合"; step by 50 so each
        # point is one disjoint 50-episode group.
        avg_scores = [np.mean(all_scores[i][j:j+50]) for j in range(0, len(all_scores[i]), 50)]
        plt.plot(avg_scores, label=f'学习率={lr}')
    plt.title('不同学习率对训练性能的影响')
    plt.xlabel('回合组 (每组50回合)')
    plt.ylabel('平均得分')
    plt.legend()
    plt.grid(True)
    plt.show()
# 运行比较
# compare_learning_rates()
折扣因子 (γ) 的影响
折扣因子决定了智能体对未来奖励的重视程度。较高的折扣因子使智能体更注重长期奖励,而较低的折扣因子使智能体更注重即时奖励。
def compare_discount_factors():
    """Compare training curves for several discount factors.

    Trains a fresh agent for 500 episodes on GridEnvironment at each
    gamma in {0.1, 0.5, 0.9, 0.99}, then plots the mean score of each
    disjoint 50-episode group.
    """
    discount_factors = [0.1, 0.5, 0.9, 0.99]
    all_scores = []
    for gamma in discount_factors:
        print(f"测试折扣因子: {gamma}")
        env = GridEnvironment()
        agent = QLearningAgent(env.state_size, env.action_space, gamma=gamma)
        scores = []
        for episode in range(500):
            state = env.reset()
            total_reward = 0
            done = False
            while not done:
                action = agent.act(state)
                next_state, reward, done = env.step(action)
                agent.update(state, action, reward, next_state, done)
                state = next_state
                total_reward += reward
            scores.append(total_reward)
        all_scores.append(scores)
    # Plot one smoothed curve per discount factor.
    plt.figure(figsize=(10, 6))
    for i, gamma in enumerate(discount_factors):
        # BUG FIX: the original stepped by 1 (a sliding window), which
        # contradicts the axis label "每组50回合"; step by 50 so each
        # point is one disjoint 50-episode group.
        avg_scores = [np.mean(all_scores[i][j:j+50]) for j in range(0, len(all_scores[i]), 50)]
        plt.plot(avg_scores, label=f'折扣因子={gamma}')
    plt.title('不同折扣因子对训练性能的影响')
    plt.xlabel('回合组 (每组50回合)')
    plt.ylabel('平均得分')
    plt.legend()
    plt.grid(True)
    plt.show()
# 运行比较
# compare_discount_factors()
实践三:扩展到更复杂的环境
在这个练习中,我们将创建一个更复杂的环境,考验智能体的泛化能力。
class ComplexGridEnvironment:
    """An 8x8 grid world: start at (7, 0), goal at (0, 7).

    Same contract as GridEnvironment but with a denser obstacle layout
    and a larger +20 goal reward; every other step costs -1.
    """

    # Action index -> (row delta, col delta): 0=up, 1=right, 2=down, 3=left.
    _MOVES = ((-1, 0), (0, 1), (1, 0), (0, -1))

    def __init__(self):
        # Cell codes: 0 = free, 1 = obstacle, 2 = goal (row 0, col 7).
        self.grid = np.array([
            [0, 0, 0, 1, 0, 0, 0, 2],
            [0, 1, 0, 1, 0, 1, 0, 0],
            [0, 1, 0, 0, 0, 1, 1, 0],
            [0, 0, 0, 1, 0, 0, 0, 0],
            [1, 1, 0, 1, 1, 1, 0, 1],
            [0, 0, 0, 0, 0, 0, 0, 0],
            [0, 1, 1, 0, 1, 1, 1, 0],
            [0, 0, 0, 0, 0, 0, 0, 0],
        ])
        self.start_pos = (7, 0)
        self.agent_pos = self.start_pos
        self.action_space = 4
        rows, cols = self.grid.shape
        self.state_size = rows * cols

    def reset(self):
        """Put the agent back at the start; return the start state index."""
        self.agent_pos = self.start_pos
        return self.pos_to_state(self.agent_pos)

    def pos_to_state(self, pos):
        """Map a (row, col) pair to a flat state index (row-major)."""
        row, col = pos
        return row * self.grid.shape[1] + col

    def state_to_pos(self, state):
        """Map a flat state index back to a (row, col) pair."""
        return divmod(state, self.grid.shape[1])

    def step(self, action):
        """Apply an action and return (next_state, reward, done)."""
        d_row, d_col = self._MOVES[action]
        row = self.agent_pos[0] + d_row
        col = self.agent_pos[1] + d_col
        inside = 0 <= row < self.grid.shape[0] and 0 <= col < self.grid.shape[1]
        # Only commit the move if it stays on the grid and avoids obstacles.
        if inside and self.grid[row, col] != 1:
            self.agent_pos = (row, col)
        done = bool(self.grid[self.agent_pos[0], self.agent_pos[1]] == 2)
        reward = 20 if done else -1
        return self.pos_to_state(self.agent_pos), reward, done

    def render(self):
        """Print the grid: '.' free, 'X' obstacle, 'G' goal, 'A' agent."""
        symbols = self.grid.copy().astype(object)
        symbols[symbols == 0] = '.'
        symbols[symbols == 1] = 'X'
        symbols[symbols == 2] = 'G'
        row, col = self.agent_pos
        # Do not overwrite the goal marker when the agent stands on it.
        if symbols[row, col] == '.':
            symbols[row, col] = 'A'
        print(symbols)
def train_on_complex_environment():
    """Train a fresh Q-learning agent on the 8x8 ComplexGridEnvironment.

    Runs 2000 episodes; every 200 episodes prints the mean score of the
    most recent 100. Returns (agent, per-episode score list).
    """
    env = ComplexGridEnvironment()
    agent = QLearningAgent(env.state_size, env.action_space, lr=0.1, gamma=0.95)
    scores = []
    for episode in range(2000):
        state = env.reset()
        episode_reward = 0
        finished = False
        while not finished:
            action = agent.act(state)
            next_state, reward, finished = env.step(action)
            agent.update(state, action, reward, next_state, finished)
            state = next_state
            episode_reward += reward
        scores.append(episode_reward)
        # Periodic progress report.
        if (episode + 1) % 200 == 0:
            avg_score = np.mean(scores[-100:])
            print(f"Episode {episode+1}, Average Score: {avg_score:.2f}")
    return agent, scores
# 运行复杂环境训练
# complex_agent, complex_scores = train_on_complex_environment()
本章小结
通过以上实践,你应该已经掌握了:
- 如何从零开始实现一个完整的Q-Learning算法
- 如何设计环境、状态、动作和奖励机制
- 如何调试和优化强化学习算法的超参数
- 如何评估智能体的性能
这些实践为你进一步学习深度强化学习(如DQN、PPO等)打下了坚实的基础。
进阶挑战
- 实现ε衰减策略:尝试不同的ε衰减策略,如指数衰减、线性衰减等,观察对训练效果的影响。
- 添加动态环境:创建一个障碍物会移动的环境,考验智能体的适应能力。
- 多智能体场景:设计一个有两个智能体协作或竞争的环境。
- 实现其他RL算法:尝试实现SARSA算法,并与Q-Learning进行比较。
💡 提示:强化学习算法的调试往往比监督学习更困难,因为其性能依赖于与环境的交互。建议在简单环境中充分测试算法后再扩展到复杂场景。