LangGraph实战2026:构建有状态多步骤AI Agent的工程指南

阅读时长:约1分钟

为什么你的Agent需要LangGraph?

在Agent开发领域,有一道绕不过去的门槛:当你的AI应用需要记住状态、做条件分支、处理循环逻辑的时候,简单的链式调用就不够用了。

LangGraph是LangChain团队在2024年推出、2026年已成为行业标配的有状态Agent框架。它的核心思想很简单:把Agent的执行流程建模成一个有向图(Graph),每个节点是一个处理步骤,边定义了流转逻辑,状态在整个图中持久化流动。

这篇文章是2026年最新版LangGraph的实战指南,会带你从最基础的概念一路走到生产级多Agent系统的搭建。


LangGraph核心概念:图、节点、边、状态

状态(State):Agent的记忆

LangGraph中一切的基础是State。State是一个Python字典(或Pydantic模型),贯穿整个Agent的执行周期。每个节点处理State、修改State,下游节点读取更新后的State。

from typing import TypedDict, Annotated, Sequence
from langchain_core.messages import BaseMessage
import operator

class AgentState(TypedDict):
    """Shared state threaded through every node of the agent graph."""
    # Message history; operator.add as the reducer gives append semantics,
    # so node return values are concatenated rather than overwriting.
    messages: Annotated[Sequence[BaseMessage], operator.add]
    # The task currently being worked on.
    current_task: str
    # Iteration counter (guards against infinite loops).
    iteration_count: int
    # Final answer; None until the agent has finished.
    final_answer: str | None

Annotated[..., operator.add] 是LangGraph的一个关键设计:通过reducer函数定义State字段如何合并。operator.add 意味着消息会追加到历史,而不是覆盖。

节点(Node):处理单元

节点是普通的Python函数,接收State,返回State的更新:

from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage

llm = ChatOpenAI(model="gpt-4o", temperature=0)

def agent_node(state: AgentState) -> dict:
    """Run the LLM over the current message history.

    Returns a partial state update: the model's reply (appended to the
    history by the messages reducer) and an incremented iteration count.
    """
    reply = llm.invoke(state["messages"])
    return {
        "messages": [reply],  # reducer appends this to the history
        "iteration_count": state["iteration_count"] + 1,
    }

def tool_executor(state: AgentState) -> dict:
    """Execute the tool calls requested by the most recent message."""
    pending = state["messages"][-1]
    # Run every requested tool; results flow back into the history.
    tool_results = execute_tools(pending.tool_calls)
    return {"messages": tool_results}

边(Edge):流转逻辑

边分为两类:

固定边:无条件跳转

# Fixed edge: after "agent" always transition to "tools".
graph.add_edge("agent", "tools")

条件边:根据State动态决定下一步

def should_continue(state: AgentState) -> str:
    """Route after the agent node: run tools again or finish."""
    # Hard cap on iterations so a misbehaving agent cannot loop forever.
    if state["iteration_count"] >= 10:
        return "end"

    # Pending tool calls on the newest message mean another tool round.
    newest = state["messages"][-1]
    if getattr(newest, "tool_calls", None):
        return "tools"

    # No tool calls: the agent has produced its final reply.
    return "end"

# Conditional edge: after "agent", call should_continue and translate its
# return value into a target node via the mapping (END terminates the run).
graph.add_conditional_edges(
    "agent",
    should_continue,
    {
        "tools": "tool_executor",
        "end": END
    }
)

从零构建:一个完整的研究Agent

让我们构建一个能够搜索网络、阅读页面并综合信息的研究Agent:

from langgraph.graph import StateGraph, END
from langgraph.prebuilt import ToolNode
from langchain_openai import ChatOpenAI
from langchain_community.tools import DuckDuckGoSearchRun, WikipediaQueryRun
from langchain_core.messages import SystemMessage
from typing import TypedDict, Annotated, Sequence
from langchain_core.messages import BaseMessage
import operator

# ── 1. Define the tools ──
# WikipediaQueryRun requires an explicit api_wrapper; instantiating it with
# no arguments raises a pydantic validation error.
from langchain_community.utilities import WikipediaAPIWrapper

search_tool = DuckDuckGoSearchRun()
wiki_tool = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
tools = [search_tool, wiki_tool]

# ── 2. Define the state ──
class ResearchState(TypedDict):
    """State carried across the research agent's graph."""
    # Conversation history, appended via the operator.add reducer.
    messages: Annotated[Sequence[BaseMessage], operator.add]
    # The question being researched.
    research_question: str
    # Sources collected so far.
    sources: list[str]
    # Number of researcher iterations executed (loop guard).
    iteration: int

# ── 3. Create the tool-aware LLM ──
# bind_tools attaches the tool schemas so the model can emit tool calls.
llm_with_tools = ChatOpenAI(model="gpt-4o", temperature=0).bind_tools(tools)

# ── 4. Define the nodes ──
SYSTEM_PROMPT = """你是一位专业的AI研究助手。
当需要查找信息时,使用搜索工具或维基百科。
综合多个来源,给出准确、全面的研究报告。"""

def researcher(state: ResearchState) -> dict:
    """LLM node: prepend the system prompt and request the next step."""
    prompt = [SystemMessage(content=SYSTEM_PROMPT), *state["messages"]]
    answer = llm_with_tools.invoke(prompt)
    return {"messages": [answer], "iteration": state["iteration"] + 1}

def route_after_agent(state: ResearchState) -> str:
    """Decide whether to run tools or stop after a researcher step."""
    # Safety valve: never loop more than 8 times.
    if state["iteration"] >= 8:
        return "end"
    latest = state["messages"][-1]
    wants_tools = bool(getattr(latest, "tool_calls", None))
    return "tools" if wants_tools else "end"

# ── 5. Assemble the graph ──
tool_node = ToolNode(tools)

workflow = StateGraph(ResearchState)
workflow.add_node("researcher", researcher)
workflow.add_node("tools", tool_node)
workflow.set_entry_point("researcher")

# Map route_after_agent's return value onto the next node.
routing = {"tools": "tools", "end": END}
workflow.add_conditional_edges("researcher", route_after_agent, routing)
# After every tool round, hand control back to the researcher.
workflow.add_edge("tools", "researcher")

# ── 6. Compile and run ──
# Import HumanMessage explicitly: this snippet's own import block only pulls
# in SystemMessage, so on its own it would raise NameError below.
from langchain_core.messages import HumanMessage

app = workflow.compile()

result = app.invoke({
    "messages": [HumanMessage(content="分析2026年AI Agent的主流技术架构")],
    "research_question": "2026年AI Agent技术架构",
    "sources": [],
    "iteration": 0
})

# Print the final synthesized answer.
print(result["messages"][-1].content)

进阶:持久化状态与人工介入

生产环境中,Agent经常需要:

  1. 跨会话记忆:用户下次打开还记得上次的对话
  2. 暂停等待人工:关键决策需要人确认后再继续

LangGraph通过Checkpointer实现这两个需求:

from langgraph.checkpoint.sqlite import SqliteSaver
from langgraph.graph import StateGraph, END
# `interrupt` is exported from langgraph.types, not langgraph.graph;
# importing it from langgraph.graph raises ImportError.
from langgraph.types import interrupt

# Persist agent state in SQLite so conversations survive process restarts.
with SqliteSaver.from_conn_string("agent_memory.db") as memory:
    app = workflow.compile(checkpointer=memory)

    # First conversation turn.
    config = {"configurable": {"thread_id": "user_123"}}
    result1 = app.invoke(
        {"messages": [HumanMessage(content="我想研究量子计算")]},
        config=config
    )

    # Second turn: the same thread_id makes the checkpointer restore the
    # previous context automatically.
    result2 = app.invoke(
        {"messages": [HumanMessage(content="重点说说量子纠错")]},
        config=config
    )

人工介入(Human-in-the-loop)

def critical_decision_node(state: AgentState) -> dict:
    """Pause the graph and wait for a human to approve a proposal.

    Uses langgraph's interrupt(): execution suspends here and resumes with
    the value supplied by the operator (e.g. via Command(resume=...)).
    """
    # AIMessage is never imported elsewhere in this article; import it
    # locally so the node is self-contained.
    from langchain_core.messages import AIMessage

    proposal = state["messages"][-1].content

    # Suspend execution; the payload is surfaced to the human reviewer.
    human_input = interrupt({
        "question": "AI提议以下操作,是否批准?",
        "proposal": proposal,
        "action": "approve_or_reject"
    })

    if human_input["decision"] == "approve":
        return {"approved": True, "messages": [AIMessage(content="操作已批准,继续执行...")]}
    else:
        return {"approved": False, "messages": [AIMessage(content=f"操作已取消: {human_input.get('reason', '')}")]}

多Agent协作:Supervisor模式

当单个Agent能力不够时,需要多个专业Agent协作。LangGraph的Supervisor模式是当前最成熟的实现方式:

from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel
from typing import Literal

# Specialized agent definitions.
# NOTE(review): create_specialized_agent and the *_tools lists are not
# defined in this article — assumed to exist elsewhere; confirm before use.
code_agent = create_specialized_agent("代码专家", code_tools)
research_agent = create_specialized_agent("研究专家", research_tools)
writing_agent = create_specialized_agent("写作专家", writing_tools)

members = ["code_expert", "researcher", "writer"]

# Supervisor: decides which agent to dispatch next.
# (Comments, not a docstring, document the fields: a docstring on a pydantic
# model becomes the schema description sent to the LLM.)
class RouteDecision(BaseModel):
    # One of the worker agents, or FINISH to end the run.
    next: Literal["code_expert", "researcher", "writer", "FINISH"]
    # Short justification for the routing choice.
    reason: str

supervisor_prompt = ChatPromptTemplate.from_messages([
    ("system", f"""你是一个任务协调器,管理以下专家:{members}
    
    根据任务需求和当前进展,决定下一步派遣哪位专家。
    当任务完成时,回复 FINISH。"""),
    ("human", "当前状态:\n{state}\n\n请决定下一步行动。")
])

def supervisor_node(state: MultiAgentState) -> dict:
    """Ask the supervisor LLM which agent should act next.

    Returns a partial state update with the routing target and the
    supervisor's stated reason.
    """
    # format_messages preserves the system/human message structure; plain
    # .format() collapses the whole prompt into one string, which the model
    # would then receive as a single human message.
    decision = supervisor_llm.with_structured_output(RouteDecision).invoke(
        supervisor_prompt.format_messages(state=str(state))
    )
    return {"next_agent": decision.next, "supervisor_reason": decision.reason}

def route_by_supervisor(state: MultiAgentState) -> str:
    """Conditional-edge router: follow the supervisor's stored decision."""
    return state["next_agent"]

# Assemble the multi-agent graph.
multi_agent_graph = StateGraph(MultiAgentState)
multi_agent_graph.add_node("supervisor", supervisor_node)

# Register each worker agent as its own node.
workers = {
    "code_expert": code_agent,
    "researcher": research_agent,
    "writer": writing_agent,
}
for worker_name, worker_agent in workers.items():
    multi_agent_graph.add_node(worker_name, worker_agent)

multi_agent_graph.set_entry_point("supervisor")

# The supervisor's decision maps directly onto node names; FINISH ends the run.
route_map = {worker_name: worker_name for worker_name in workers}
route_map["FINISH"] = END
multi_agent_graph.add_conditional_edges("supervisor", route_by_supervisor, route_map)

# Every worker reports back to the supervisor when it finishes.
for worker_name in workers:
    multi_agent_graph.add_edge(worker_name, "supervisor")

生产部署关键要点

1. 错误处理与重试

from tenacity import retry, stop_after_attempt, wait_exponential

def resilient_node(state: AgentState) -> dict:
    """LLM node with retries and graceful degradation.

    Retries transient LLM failures up to 3 times with exponential backoff;
    if every attempt fails, returns an error message instead of crashing
    the graph.
    """
    # AIMessage is not imported in this snippet; import locally so the
    # node stands on its own.
    from langchain_core.messages import AIMessage

    # reraise=True surfaces the original exception instead of tenacity's
    # RetryError, so the fallback message shows the real cause.
    @retry(stop=stop_after_attempt(3),
           wait=wait_exponential(min=1, max=10),
           reraise=True)
    def call_with_retry():
        return llm.invoke(state["messages"])

    try:
        response = call_with_retry()
        return {"messages": [response]}
    except Exception as e:
        # Deliberate broad catch: degrade gracefully rather than crash.
        return {"messages": [AIMessage(content=f"处理失败,请重试: {str(e)}")]}

2. 流式输出

async def stream_tokens(initial_state: dict) -> None:
    """Stream model tokens to stdout as they are generated.

    `async for` is only valid inside an async function — the original
    top-level loop is a SyntaxError in a module. Run this with
    asyncio.run(stream_tokens(state)).
    """
    async for event in app.astream_events(initial_state, version="v2"):
        # Only chat-model token chunks are printed; other events are skipped.
        if event["event"] == "on_chat_model_stream":
            chunk = event["data"]["chunk"]
            print(chunk.content, end="", flush=True)

3. 可观测性

from langsmith import traceable

@traceable(name="research_agent_run")
def run_agent(query: str) -> str:
    """Invoke the research agent once and return its final answer text."""
    initial_state = {"messages": [HumanMessage(content=query)]}
    outcome = app.invoke(initial_state)
    final_message = outcome["messages"][-1]
    return final_message.content

总结

LangGraph在2026年已经成为构建生产级AI Agent的首选框架,核心优势是:

  1. 有状态:State机制让复杂业务逻辑变得可管理
  2. 可控:条件边、人工介入让Agent行为可预期
  3. 可扩展:从单Agent到多Agent Supervisor,架构平滑升级
  4. 持久化:Checkpointer让跨会话记忆不再困难

掌握LangGraph,是2026年AI工程师的核心竞争力之一。不要等到项目复杂了才学——现在就开始用图的思维建模你的Agent流程。