强化学习实战练习:从理论到实践的桥梁

阅读时间:约1分钟

🎯 学习目标:通过动手实践,深入理解强化学习的基本概念和Q-Learning算法。完成本练习后,你将能够独立实现简单的强化学习算法,并将其应用于解决实际问题。

实践一:实现一个简单的Q-Learning算法

在第一个练习中,我们将从零开始实现一个Q-Learning算法,解决经典的"走迷宫"问题。

问题描述

我们有一个5x5的网格世界,智能体从左下角(4,0)出发,目标是到达右上角(0,4)的宝藏位置。网格中可能有一些障碍物,智能体需要绕过这些障碍物找到最短路径。

S: 起点 (Start)
G: 目标 (Goal)
X: 障碍物 (Obstacle)
.: 可通行区域 (Free space)

地图布局:
. . . . G
. X . X .
. . . . .
. X . X .
S . . . .

动作空间

智能体可以执行4个动作:

  • 0: 向上移动
  • 1: 向右移动
  • 2: 向下移动
  • 3: 向左移动

如果动作会导致智能体移出网格或撞到障碍物,智能体位置不会改变。

奖励机制

  • 到达目标位置:+10 (终止状态)
  • 其他所有移动:-1 (鼓励寻找最短路径)

实现步骤

import numpy as np
import random
import matplotlib.pyplot as plt

class GridEnvironment:
    """5x5 grid-world maze for tabular Q-Learning.

    Cell codes: 0 = free space, 1 = obstacle, 2 = goal. States are flat
    indices (row * n_cols + col); actions are 0=up, 1=right, 2=down,
    3=left. Rewards: +10 on reaching the goal (episode ends), -1 per
    step otherwise (encourages the shortest path).
    """

    def __init__(self, grid=None, start_pos=None):
        """Build the environment.

        Args:
            grid: optional 2D array-like of cell codes (0/1/2); defaults
                to the built-in 5x5 maze with the goal at (0, 4).
            start_pos: optional (row, col) start cell; defaults to (4, 0).
        """
        if grid is None:
            # Default maze: goal at (0, 4), start at (4, 0).
            grid = [
                [0, 0, 0, 0, 2],
                [0, 1, 0, 1, 0],
                [0, 0, 0, 0, 0],
                [0, 1, 0, 1, 0],
                [0, 0, 0, 0, 0],
            ]
        self.grid = np.asarray(grid)
        self.start_pos = (4, 0) if start_pos is None else tuple(start_pos)
        self.agent_pos = self.start_pos
        self.action_space = 4  # up, right, down, left
        self.state_size = self.grid.shape[0] * self.grid.shape[1]

    def reset(self):
        """Move the agent back to the start cell and return the start state."""
        self.agent_pos = self.start_pos
        return self.pos_to_state(self.agent_pos)

    def pos_to_state(self, pos):
        """Map a (row, col) position to a flat state index."""
        return pos[0] * self.grid.shape[1] + pos[1]

    def state_to_pos(self, state):
        """Map a flat state index back to a (row, col) position."""
        row = state // self.grid.shape[1]
        col = state % self.grid.shape[1]
        return (row, col)

    def step(self, action):
        """Apply an action and return (next_state, reward, done).

        Moves that would leave the grid or enter an obstacle leave the
        agent in place (the -1 step reward still applies).
        """
        # Row/column deltas indexed by action: 0=up, 1=right, 2=down, 3=left.
        moves = [(-1, 0), (0, 1), (1, 0), (0, -1)]
        new_row = self.agent_pos[0] + moves[action][0]
        new_col = self.agent_pos[1] + moves[action][1]

        # Only commit the move if it stays on the grid and off obstacles.
        if (0 <= new_row < self.grid.shape[0] and
            0 <= new_col < self.grid.shape[1] and
            self.grid[new_row, new_col] != 1):
            self.agent_pos = (new_row, new_col)

        if self.grid[self.agent_pos[0], self.agent_pos[1]] == 2:  # reached goal
            reward = 10
            done = True
        else:
            reward = -1
            done = False

        return self.pos_to_state(self.agent_pos), reward, done

    def render(self):
        """Print the grid, marking the agent 'A' unless it sits on 'G'."""
        display_grid = self.grid.copy().astype(object)
        display_grid[display_grid == 0] = '.'
        display_grid[display_grid == 1] = 'X'
        display_grid[display_grid == 2] = 'G'
        agent_pos = self.agent_pos
        if display_grid[agent_pos[0], agent_pos[1]] == '.':
            display_grid[agent_pos[0], agent_pos[1]] = 'A'
        print(display_grid)

class QLearningAgent:
    """Tabular Q-Learning agent with an epsilon-greedy behavior policy."""

    def __init__(self, state_size, action_size, lr=0.1, gamma=0.95, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01):
        self.state_size = state_size
        self.action_size = action_size
        self.lr = lr                        # learning rate (alpha)
        self.gamma = gamma                  # discount factor
        self.epsilon = epsilon              # current exploration rate
        self.epsilon_decay = epsilon_decay  # multiplicative decay per update
        self.epsilon_min = epsilon_min      # floor below which epsilon stops decaying
        # Q(s, a) table, initialised to all zeros.
        self.q_table = np.zeros((state_size, action_size))

    def act(self, state):
        """Choose an action epsilon-greedily for the given state."""
        explore = np.random.rand() <= self.epsilon
        if explore:
            return random.randrange(self.action_size)
        return np.argmax(self.q_table[state])

    def update(self, state, action, reward, next_state, done):
        """Apply one TD(0) update to Q(state, action), then decay epsilon.

        NOTE(review): epsilon decays on every update call, i.e. once per
        environment step rather than once per episode.
        """
        # Bootstrap from the best next-state value unless the episode ended.
        future_value = 0.0 if done else self.gamma * np.max(self.q_table[next_state])
        td_target = reward + future_value
        td_error = td_target - self.q_table[state, action]
        self.q_table[state, action] += self.lr * td_error

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

def train_agent(episodes=1000, max_steps=None):
    """Train a Q-Learning agent on the 5x5 GridEnvironment.

    Args:
        episodes: number of training episodes to run.
        max_steps: optional cap on steps per episode. The default None
            keeps the original unbounded behavior; passing a cap guards
            against an infinite loop if the goal is unreachable.

    Returns:
        (agent, scores): the trained agent and the list of per-episode
        total rewards.
    """
    env = GridEnvironment()
    agent = QLearningAgent(env.state_size, env.action_space)

    scores = []

    for episode in range(episodes):
        state = env.reset()
        total_reward = 0
        done = False
        steps = 0

        while not done and (max_steps is None or steps < max_steps):
            action = agent.act(state)
            next_state, reward, done = env.step(action)
            agent.update(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward
            steps += 1

        scores.append(total_reward)

        # Progress report every 100 episodes (mean over the last 100).
        if (episode + 1) % 100 == 0:
            avg_score = np.mean(scores[-100:])
            print(f"Episode {episode+1}, Average Score: {avg_score:.2f}, Epsilon: {agent.epsilon:.3f}")

    return agent, scores

def test_agent(agent, episodes=5):
    """Run greedy rollouts with a trained agent, printing every step."""
    env = GridEnvironment()

    for episode in range(episodes):
        state = env.reset()
        done = False
        print(f"\n--- 测试回合 {episode+1} ---")
        env.render()

        # Always pick the highest-Q action; cap each rollout at 20 steps.
        for _ in range(20):
            if done:
                break
            action = int(np.argmax(agent.q_table[state]))
            state, reward, done = env.step(action)
            print(f"动作: {['上', '右', '下', '左'][action]}, 奖励: {reward}")
            env.render()

        if done:
            print("✅ 成功到达目标!")
        else:
            print("❌ 未能在限定步数内到达目标")

# Entry point: train, evaluate, then plot the learning curves.
if __name__ == "__main__":
    print("开始训练Q-Learning智能体...")
    trained_agent, scores = train_agent(episodes=1000)

    print("\n训练完成,开始测试...")
    test_agent(trained_agent, episodes=3)

    # Left panel: raw per-episode score. Right panel: 100-episode
    # sliding-window mean (the window advances one episode at a time).
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.plot(scores)
    plt.title('训练过程中的得分')
    plt.xlabel('回合数')
    plt.ylabel('得分')

    plt.subplot(1, 2, 2)
    window_means = [np.mean(scores[start:start + 100]) for start in range(len(scores) - 100)]
    plt.plot(window_means)
    plt.title('平均每100回合得分')
    plt.xlabel('回合组 (每组100回合)')
    plt.ylabel('平均得分')

    plt.tight_layout()
    plt.show()

实践二:分析Q-Learning的参数影响

在这个练习中,我们将探索不同参数对Q-Learning算法性能的影响。

学习率 (α) 的影响

学习率决定了我们更新Q值时新信息的重要程度。过高的学习率可能导致算法不稳定,而过低的学习率可能导致收敛缓慢。

def compare_learning_rates():
    """Train one agent per learning rate and plot smoothed score curves."""
    learning_rates = [0.01, 0.1, 0.5, 0.9]
    all_scores = []

    for lr in learning_rates:
        print(f"测试学习率: {lr}")
        env = GridEnvironment()
        agent = QLearningAgent(env.state_size, env.action_space, lr=lr)
        episode_returns = []

        for _ in range(500):
            state = env.reset()
            episode_return = 0
            done = False

            while not done:
                action = agent.act(state)
                next_state, reward, done = env.step(action)
                agent.update(state, action, reward, next_state, done)
                state = next_state
                episode_return += reward

            episode_returns.append(episode_return)

        all_scores.append(episode_returns)

    # Smooth each curve with a 50-episode sliding window before plotting.
    plt.figure(figsize=(10, 6))
    for i, lr in enumerate(learning_rates):
        smoothed = [np.mean(all_scores[i][j:j + 50]) for j in range(len(all_scores[i]) - 50)]
        plt.plot(smoothed, label=f'学习率={lr}')

    plt.title('不同学习率对训练性能的影响')
    plt.xlabel('回合组 (每组50回合)')
    plt.ylabel('平均得分')
    plt.legend()
    plt.grid(True)
    plt.show()

# 运行比较
# compare_learning_rates()

折扣因子 (γ) 的影响

折扣因子决定了智能体对未来奖励的重视程度。较高的折扣因子使智能体更注重长期奖励,而较低的折扣因子使智能体更注重即时奖励。

def compare_discount_factors():
    """Train one agent per discount factor and plot smoothed score curves."""
    discount_factors = [0.1, 0.5, 0.9, 0.99]
    all_scores = []

    for gamma in discount_factors:
        print(f"测试折扣因子: {gamma}")
        env = GridEnvironment()
        agent = QLearningAgent(env.state_size, env.action_space, gamma=gamma)
        episode_returns = []

        for _ in range(500):
            state = env.reset()
            episode_return = 0
            done = False

            while not done:
                action = agent.act(state)
                next_state, reward, done = env.step(action)
                agent.update(state, action, reward, next_state, done)
                state = next_state
                episode_return += reward

            episode_returns.append(episode_return)

        all_scores.append(episode_returns)

    # Smooth each curve with a 50-episode sliding window before plotting.
    plt.figure(figsize=(10, 6))
    for i, gamma in enumerate(discount_factors):
        smoothed = [np.mean(all_scores[i][j:j + 50]) for j in range(len(all_scores[i]) - 50)]
        plt.plot(smoothed, label=f'折扣因子={gamma}')

    plt.title('不同折扣因子对训练性能的影响')
    plt.xlabel('回合组 (每组50回合)')
    plt.ylabel('平均得分')
    plt.legend()
    plt.grid(True)
    plt.show()

# 运行比较
# compare_discount_factors()

实践三:扩展到更复杂的环境

在这个练习中,我们将创建一个更复杂的环境,考验智能体的泛化能力。

class ComplexGridEnvironment:
    """8x8 grid-world maze with a denser obstacle layout.

    Same conventions as the simple environment: cell codes 0/1/2 for
    free/obstacle/goal, flat-index states, actions 0=up, 1=right,
    2=down, 3=left. Reaching the goal pays +20 and ends the episode;
    every other step costs -1.
    """

    # (row delta, col delta) per action: 0=up, 1=right, 2=down, 3=left.
    MOVES = ((-1, 0), (0, 1), (1, 0), (0, -1))

    def __init__(self):
        self.grid = np.array([
            [0, 0, 0, 1, 0, 0, 0, 2],  # goal at (0, 7)
            [0, 1, 0, 1, 0, 1, 0, 0],
            [0, 1, 0, 0, 0, 1, 1, 0],
            [0, 0, 0, 1, 0, 0, 0, 0],
            [1, 1, 0, 1, 1, 1, 0, 1],
            [0, 0, 0, 0, 0, 0, 0, 0],
            [0, 1, 1, 0, 1, 1, 1, 0],
            [0, 0, 0, 0, 0, 0, 0, 0]   # start at (7, 0)
        ])
        self.start_pos = (7, 0)
        self.agent_pos = self.start_pos
        self.action_space = 4
        self.state_size = self.grid.shape[0] * self.grid.shape[1]

    def reset(self):
        """Return the agent to the start cell and yield its state index."""
        self.agent_pos = self.start_pos
        return self.pos_to_state(self.agent_pos)

    def pos_to_state(self, pos):
        """Flatten a (row, col) position into a state index."""
        return pos[0] * self.grid.shape[1] + pos[1]

    def state_to_pos(self, state):
        """Unflatten a state index into a (row, col) position."""
        n_cols = self.grid.shape[1]
        return (state // n_cols, state % n_cols)

    def step(self, action):
        """Apply an action; off-grid or into-obstacle moves are no-ops."""
        d_row, d_col = self.MOVES[action]
        row = self.agent_pos[0] + d_row
        col = self.agent_pos[1] + d_col

        n_rows, n_cols = self.grid.shape
        if 0 <= row < n_rows and 0 <= col < n_cols and self.grid[row, col] != 1:
            self.agent_pos = (row, col)

        at_goal = self.grid[self.agent_pos] == 2
        reward = 20 if at_goal else -1
        return self.pos_to_state(self.agent_pos), reward, bool(at_goal)

    def render(self):
        """Print the grid, marking the agent 'A' unless it sits on 'G'."""
        symbols = self.grid.copy().astype(object)
        for code, glyph in ((0, '.'), (1, 'X'), (2, 'G')):
            symbols[symbols == code] = glyph
        r, c = self.agent_pos
        if symbols[r, c] == '.':
            symbols[r, c] = 'A'
        print(symbols)

def train_on_complex_environment():
    """Train a fresh Q-Learning agent on the 8x8 complex maze."""
    env = ComplexGridEnvironment()
    agent = QLearningAgent(env.state_size, env.action_space, lr=0.1, gamma=0.95)

    scores = []
    for episode in range(2000):
        state = env.reset()
        episode_return = 0
        done = False

        while not done:
            action = agent.act(state)
            next_state, reward, done = env.step(action)
            agent.update(state, action, reward, next_state, done)
            state = next_state
            episode_return += reward

        scores.append(episode_return)

        # Every 200 episodes, report the mean of the last 100 returns.
        if (episode + 1) % 200 == 0:
            avg_score = np.mean(scores[-100:])
            print(f"Episode {episode+1}, Average Score: {avg_score:.2f}")

    return agent, scores

# 运行复杂环境训练
# complex_agent, complex_scores = train_on_complex_environment()

本章小结

通过以上实践,你应该已经掌握了:

  1. 如何从零开始实现一个完整的Q-Learning算法
  2. 如何设计环境、状态、动作和奖励机制
  3. 如何调试和优化强化学习算法的超参数
  4. 如何评估智能体的性能

这些实践为你进一步学习深度强化学习(如DQN、PPO等)打下了坚实的基础。

进阶挑战

  1. 实现ε衰减策略:尝试不同的ε衰减策略,如指数衰减、线性衰减等,观察对训练效果的影响。

  2. 添加动态环境:创建一个障碍物会移动的环境,考验智能体的适应能力。

  3. 多智能体场景:设计一个有两个智能体协作或竞争的环境。

  4. 实现其他RL算法:尝试实现SARSA算法,并与Q-Learning进行比较。


💡 提示:强化学习算法的调试往往比监督学习更困难,因为其性能依赖于与环境的交互。建议在简单环境中充分测试算法后再扩展到复杂场景。