Context Management for AI Engineers: Engineering Practices to Keep Long Conversations from Forgetting


One of the biggest limitations of LLMs is the finite context window. GPT-4o offers 128K tokens and Gemini 1.5 Pro offers 1 million — that sounds enormous, but in real production use, accumulated conversation history, retrieved knowledge-base content, and tool-call results fill it up quickly. The deeper issue is this: the goal is not to stuff the context full, but to make the limited context space carry the most valuable information.

The Four Core Problems of Context Management

  1. Memory: LLMs are stateless by default; every request is independent, and the model cannot remember the previous conversation on its own.
  2. Capacity: even with a million-token window, filling it up slows responses and drives up cost (a rough token-counting sketch follows this list).
  3. Relevance: not all historical information is valuable; stale information can interfere with the new reply.
  4. Consistency: in a long conversation, the model may "forget" rules or user preferences agreed on earlier.
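
To get a feel for how quickly a window fills up, it helps to measure prompts in tokens rather than characters. Below is a minimal counting sketch using the tiktoken library; the o200k_base encoding is an assumption for GPT-4o-family models, so swap in whatever encoding your model actually uses.

import tiktoken

# o200k_base is the encoding used by the GPT-4o family; adjust for other models
enc = tiktoken.get_encoding("o200k_base")

def count_message_tokens(messages: list[dict]) -> int:
    """Rough token count for a chat message list (ignores per-message formatting overhead)."""
    return sum(len(enc.encode(m["content"])) for m in messages)

history = [
    {"role": "user", "content": "Summarize the decisions from our Q3 roadmap discussion."},
    {"role": "assistant", "content": "Sure - the main decisions were ..."},
]
print(count_message_tokens(history))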

Strategies for Managing Conversation History

Strategy 1: Sliding Window

The simplest approach: keep only the most recent N messages and drop the oldest once the limit is exceeded:

from collections import deque

class SlidingWindowMemory:
    """Sliding-window conversation memory."""
    
    def __init__(self, max_messages: int = 20, system_prompt: str = ""):
        self.max_messages = max_messages
        self.system_prompt = system_prompt
        # A deque with maxlen silently drops the oldest message once the window is full
        self.messages = deque(maxlen=max_messages)
    
    def add_message(self, role: str, content: str):
        self.messages.append({"role": role, "content": content})
    
    def get_messages(self) -> list[dict]:
        """Return the full message list (including the system prompt)."""
        messages = []
        if self.system_prompt:
            messages.append({"role": "system", "content": self.system_prompt})
        messages.extend(list(self.messages))
        return messages
    
    def chat(self, user_input: str, client, model: str = "gpt-4o") -> str:
        self.add_message("user", user_input)
        
        response = client.chat.completions.create(
            model=model,
            messages=self.get_messages(),
        )
        
        assistant_reply = response.choices[0].message.content
        self.add_message("assistant", assistant_reply)
        return assistant_reply

Pros: simple, predictable, cost under control.
Cons: history is lost; when the user refers to "what you said earlier about XXX", the model may be unable to respond.
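
A minimal usage sketch, assuming an OpenAI client configured via the OPENAI_API_KEY environment variable (the messages are placeholders):

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment
memory = SlidingWindowMemory(max_messages=20, system_prompt="You are a helpful assistant.")

print(memory.chat("My name is Alex and I prefer concise answers.", client))
print(memory.chat("What name did I just tell you?", client))  # still within the window, so it can answer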

Strategy 2: Summary Compression

When the conversation exceeds a threshold, compress the older part into a summary:

from openai import OpenAI

class SummaryCompressedMemory:
    """Conversation memory with summary compression."""
    
    def __init__(self, client: OpenAI, max_recent_messages: int = 10,
                 compression_threshold: int = 20, system_prompt: str = ""):
        self.client = client
        self.max_recent = max_recent_messages
        self.compression_threshold = compression_threshold
        self.system_prompt = system_prompt
        self.summary = ""          # running summary of older history
        self.recent_messages = []  # most recent messages, kept verbatim
    
    def _compress(self):
        """Fold the oldest messages into the summary."""
        # Compress the oldest half of the buffered messages
        to_compress = self.recent_messages[:len(self.recent_messages)//2]
        remaining = self.recent_messages[len(self.recent_messages)//2:]
        
        conversation_text = "\n".join([
            f"{m['role'].upper()}: {m['content']}" for m in to_compress
        ])
        
        compress_prompt = f"""
        Existing summary of earlier history:
        {self.summary}
        
        New conversation to fold in:
        {conversation_text}
        
        Merge the above into one concise conversation summary, preserving key facts,
        user preferences, and important decisions:
        """
        
        response = self.client.chat.completions.create(
            model="gpt-4o-mini",  # use a cheap, small model for compression
            messages=[{"role": "user", "content": compress_prompt}]
        )
        
        self.summary = response.choices[0].message.content
        self.recent_messages = remaining
    
    def add_message(self, role: str, content: str):
        self.recent_messages.append({"role": role, "content": content})
        
        # Trigger compression once the buffer crosses the threshold
        if len(self.recent_messages) >= self.compression_threshold:
            self._compress()
    
    def get_messages(self) -> list[dict]:
        messages = []
        
        # System prompt, with the running summary appended
        system_content = self.system_prompt
        if self.summary:
            system_content += f"\n\n## Conversation summary so far\n{self.summary}"
        
        if system_content:
            messages.append({"role": "system", "content": system_content})
        
        # Most recent messages, kept verbatim
        messages.extend(self.recent_messages[-self.max_recent:])
        return messages
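
SummaryCompressedMemory does not include a chat() helper of its own; a minimal sketch of wiring it to a completion call, following the model names used above:

client = OpenAI()
memory = SummaryCompressedMemory(client, system_prompt="You are a project-planning assistant.")

def chat(user_input: str) -> str:
    memory.add_message("user", user_input)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=memory.get_messages(),
    )
    reply = response.choices[0].message.content
    memory.add_message("assistant", reply)  # may trigger _compress() once over the threshold
    return reply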

Strategy 3: Importance Filtering

Not all messages are equally important; score them and dynamically choose which ones to keep:

class ImportanceFilteredMemory:
    """Memory management based on importance scores."""
    
    IMPORTANCE_PROMPT = """
    Rate the importance of the following conversation message from 0 to 10, considering:
    - Contains key user preferences or requirements (+3)
    - Contains key decisions or agreements (+3)
    - Contains concrete data or code (+2)
    - Is ordinary small talk (-2)
    - Has been superseded by later messages (-2)
    
    Message: {message}
    Output a single integer between 0 and 10:
    """
    
    def __init__(self, client, max_total_tokens: int = 4000):
        self.client = client
        self.max_tokens = max_total_tokens
        self.messages_with_scores = []
    
    def _estimate_tokens(self, text: str) -> int:
        # Rough estimate (~3-4 characters per token for English text); use tiktoken in production
        return len(text) // 3
    
    def _score_message(self, message: dict) -> float:
        """Score a message's importance."""
        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{
                "role": "user",
                "content": self.IMPORTANCE_PROMPT.format(
                    message=f"{message['role']}: {message['content']}"
                )
            }]
        )
        try:
            return float(response.choices[0].message.content.strip())
        except (ValueError, TypeError):
            return 5.0  # fall back to a neutral score if the model does not return a number
    
    def add_message(self, role: str, content: str, auto_score: bool = False):
        message = {"role": role, "content": content}
        score = self._score_message(message) if auto_score else 5.0
        self.messages_with_scores.append({
            "message": message,
            "score": score,
            "tokens": self._estimate_tokens(content)
        })
    
    def get_messages(self, budget_tokens: int = None) -> list[dict]:
        """Within the token budget, prefer high-scoring messages."""
        budget = budget_tokens or self.max_tokens
        
        # Always keep the most recent messages (preserves conversational coherence)
        recent = self.messages_with_scores[-4:]
        recent_tokens = sum(m["tokens"] for m in recent)
        remaining_budget = budget - recent_tokens
        
        # Rank the remaining messages by score and greedily select within the budget
        older = self.messages_with_scores[:-4]
        older_sorted = sorted(older, key=lambda x: x["score"], reverse=True)
        
        selected = []
        for item in older_sorted:
            if remaining_budget >= item["tokens"]:
                selected.append(item)
                remaining_budget -= item["tokens"]
        
        # Re-emit the chosen messages in their original order
        selected_ids = set(id(m) for m in selected) | set(id(m) for m in recent)
        return [m["message"] for m in self.messages_with_scores
                if id(m) in selected_ids]
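
A brief usage sketch (auto_score=True adds one scoring call per message, so it costs latency and money; the budget figure is illustrative and the client is the one created earlier):

memory = ImportanceFilteredMemory(client, max_total_tokens=4000)

memory.add_message("user", "We agreed to ship the beta on March 3rd.", auto_score=True)
memory.add_message("assistant", "Noted - the beta ships March 3rd.", auto_score=True)
memory.add_message("user", "By the way, nice weather today.", auto_score=True)  # likely scores low

context = memory.get_messages(budget_tokens=2000)  # recent messages plus the highest-scoring older ones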

External Memory: Long-Term Memory in a Vector Database

For scenarios that need memory across sessions, an external memory store is required:

from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from datetime import datetime
import json
import os

class VectorMemoryStore:
    """Long-term memory backed by a vector database."""
    
    def __init__(self, user_id: str, persist_dir: str = "./memory_store"):
        self.user_id = user_id
        self.persist_dir = persist_dir
        self.embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
        self._load_or_create()
    
    def _load_or_create(self):
        store_path = f"{self.persist_dir}/{self.user_id}"
        if os.path.exists(store_path):
            # Recent langchain versions require explicitly opting in to pickle deserialization
            self.vectorstore = FAISS.load_local(
                store_path, self.embeddings,
                allow_dangerous_deserialization=True
            )
        else:
            os.makedirs(store_path, exist_ok=True)
            self.vectorstore = FAISS.from_texts(
                ["memory store initialized"],
                self.embeddings,
                metadatas=[{"type": "init"}]
            )
    
    def save_memory(self, content: str, memory_type: str = "conversation"):
        """Save one memory entry."""
        self.vectorstore.add_texts(
            texts=[content],
            metadatas=[{
                "user_id": self.user_id,
                "type": memory_type,
                "timestamp": datetime.now().isoformat(),
            }]
        )
        # Persist to disk
        self.vectorstore.save_local(f"{self.persist_dir}/{self.user_id}")
    
    def retrieve_relevant(self, query: str, k: int = 5) -> list[str]:
        """Retrieve memories relevant to the query."""
        docs = self.vectorstore.similarity_search(query, k=k)
        return [doc.page_content for doc in docs 
                if doc.metadata.get("type") != "init"]
    
    def build_memory_context(self, current_query: str) -> str:
        """Build a memory block to inject into the system prompt."""
        memories = self.retrieve_relevant(current_query)
        if not memories:
            return ""
        
        memory_text = "\n".join([f"- {m}" for m in memories])
        return f"""
## Relevant past memories
The following information from earlier conversations is related to the current one:
{memory_text}

(These entries come from past conversations and may be useful for the current reply.)
"""

Entity Memory: Tracking Key Entities in the Conversation

class EntityMemory:
    """Track key entities in the conversation (names, project names, preferences, ...)."""
    
    def __init__(self, client: OpenAI):
        self.client = client
        self.entities = {}  # {entity_name: description}
    
    def extract_and_update(self, user_message: str, assistant_message: str):
        """Extract entity information from one exchange."""
        prompt = f"""
        Extract key entity information (names, project names, user preferences, important facts, ...)
        from the conversation below.
        If a known entity has new information, update it; if an entity is entirely new, add it.
        
        Known entities: {json.dumps(self.entities, ensure_ascii=False)}
        
        User: {user_message}
        Assistant: {assistant_message}
        
        Output the entities that need updating as JSON (omit entities that did not change):
        {{"entity name": "entity description", ...}}
        If there is no new entity information, output an empty object: {{}}
        """
        
        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        
        updates = json.loads(response.choices[0].message.content)
        self.entities.update(updates)
    
    def get_context(self) -> str:
        if not self.entities:
            return ""
        entity_text = "\n".join([f"- {k}: {v}" for k, v in self.entities.items()])
        return f"## Known entities\n{entity_text}"

Putting It Together: A Complete Memory Management System

Combine the strategies above into a production-ready memory system:

class ProductionMemorySystem:
    """Production-grade conversation memory system."""
    
    def __init__(self, client: OpenAI, user_id: str, system_prompt: str = ""):
        self.client = client
        self.base_system = system_prompt
        
        # Short-term memory: sliding window
        self.recent_history = deque(maxlen=20)
        
        # Mid-term memory: summary compression
        # (populate this the way SummaryCompressedMemory does, before the sliding
        #  window starts dropping older messages)
        self.session_summary = ""
        
        # Long-term memory: vector store
        self.vector_memory = VectorMemoryStore(user_id)
        
        # Entity memory
        self.entity_memory = EntityMemory(client)
    
    def build_messages(self, user_input: str) -> list[dict]:
        """Assemble the full message list."""
        
        # 1. Retrieve relevant information from long-term memory
        long_term_context = self.vector_memory.build_memory_context(user_input)
        
        # 2. Collect known entity information
        entity_context = self.entity_memory.get_context()
        
        # 3. Build the system prompt
        system_parts = [self.base_system]
        if long_term_context:
            system_parts.append(long_term_context)
        if entity_context:
            system_parts.append(entity_context)
        if self.session_summary:
            system_parts.append(f"## Current session summary\n{self.session_summary}")
        
        system_message = "\n\n".join(filter(None, system_parts))
        
        messages = []
        if system_message:
            messages.append({"role": "system", "content": system_message})
        messages.extend(list(self.recent_history))
        messages.append({"role": "user", "content": user_input})
        
        return messages
    
    def chat(self, user_input: str) -> str:
        messages = self.build_messages(user_input)
        
        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
        )
        
        assistant_reply = response.choices[0].message.content
        
        # Update each memory layer
        self.recent_history.append({"role": "user", "content": user_input})
        self.recent_history.append({"role": "assistant", "content": assistant_reply})
        self.entity_memory.extract_and_update(user_input, assistant_reply)
        
        # Persist important exchanges to long-term memory
        self._maybe_save_to_long_term(user_input, assistant_reply)
        
        return assistant_reply
    
    def _maybe_save_to_long_term(self, user_msg: str, assistant_msg: str):
        """Decide whether this exchange should go into long-term memory."""
        # Simple heuristic: the user message is fairly long or contains memory-related keywords
        keywords = ["remember", "from now on", "my preference", "always"]
        should_save = (len(user_msg) > 50 or
                       any(kw in user_msg.lower() for kw in keywords))
        
        if should_save:
            memory_content = f"User said: {user_msg}\nAssistant replied: {assistant_msg[:200]}..."
            self.vector_memory.save_memory(memory_content)
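
A minimal end-to-end sketch (the user id and messages are placeholders; note that each chat() turn also makes extra calls for retrieval, entity extraction and, when triggered, long-term saving):

client = OpenAI()
assistant = ProductionMemorySystem(
    client,
    user_id="user_42",
    system_prompt="You are a personal engineering assistant."
)

print(assistant.chat("Remember that our staging cluster is named orion-stg."))
# Later turns can draw on the sliding window, entity memory, and retrieved long-term memories:
print(assistant.chat("Which cluster do we deploy release candidates to?"))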

Engineering Takeaways

There is no silver bullet for context management; pick the strategy that fits the scenario:

Scenario | Recommended approach
Simple chatbot | Sliding window (20 turns)
Long-lived assistant (cross-session) | Vector memory + summary compression
Needs to remember user information | Entity memory
Complex multi-task agent | Combined memory system
Cost-sensitive scenarios | Importance filtering + small-model compression

The core principle: don't cram everything into the context; put the most relevant information in the most suitable place. Good context management lets the model spend its limited attention on the information that matters most.