The gap in user experience while waiting for an AI response is enormous: waiting 5 seconds to see a complete answer, versus seeing text start to appear within 0.5 seconds — the latter reportedly raises user satisfaction by more than 70%. Streaming output is not just a UX optimization; it is a core capability of AI application architecture. This article walks through the engineering of LLM streaming output, covering the full technical chain from backend to frontend.
1. How Streaming Output Works
LLM text generation is an autoregressive, token-by-token process — at each step the model predicts only the next token rather than producing a complete sentence at once. Streaming output exploits this property: every token is pushed to the client as soon as it is generated, instead of waiting for the full response.
The implementation typically relies on Server-Sent Events (SSE) or WebSocket:
```
Client                        Server                          LLM API
  |                             |                                |
  |-- HTTP request ------------>|                                |
  |                             |-- request with stream=True --->|
  |<- data: {"text":"Hel"}      |<- token: "Hel" ----------------|
  |<- data: {"text":"lo"}       |<- token: "lo" -----------------|
  |<- data: {"text":"!"}        |<- token: "!" ------------------|
  |<- data: [DONE]              |<- [DONE] ----------------------|
```
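To make the wire format concrete, here is a minimal sketch of a Python client consuming such an SSE endpoint. It assumes the `/api/chat/stream` route from section 2.1 below and uses `httpx`; the URL and payload are illustrative only.

```python
import asyncio
import json
import httpx

async def consume_stream() -> None:
    """Minimal SSE consumer: print text deltas as they arrive."""
    payload = {"messages": [{"role": "user", "content": "Hello"}]}
    async with httpx.AsyncClient(timeout=None) as http:
        async with http.stream(
            "POST", "http://localhost:8000/api/chat/stream", json=payload
        ) as resp:
            async for line in resp.aiter_lines():
                if not line.startswith("data: "):
                    continue  # skip blank separator lines between events
                event = json.loads(line[len("data: "):])
                if event["type"] == "text_delta":
                    print(event["text"], end="", flush=True)
                elif event["type"] == "done":
                    print()  # stream finished

asyncio.run(consume_stream())
```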
2. Backend Streaming Implementation
2.1 FastAPI + SSE Implementation
```python
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
import anthropic
import json
import asyncio

app = FastAPI()
client = anthropic.AsyncAnthropic()

@app.post("/api/chat/stream")
async def chat_stream(request: Request):
    """Streaming chat endpoint (SSE format)."""
    body = await request.json()
    messages = body.get("messages", [])
    model = body.get("model", "claude-opus-4-7")

    async def generate():
        try:
            async with client.messages.stream(
                model=model,
                max_tokens=2000,
                messages=messages
            ) as stream:
                async for text in stream.text_stream:
                    # SSE format: data: {json}\n\n
                    chunk = json.dumps({
                        "type": "text_delta",
                        "text": text
                    }, ensure_ascii=False)
                    yield f"data: {chunk}\n\n"

                # Send the completion signal
                final_message = await stream.get_final_message()
                done_data = json.dumps({
                    "type": "done",
                    "usage": {
                        "input_tokens": final_message.usage.input_tokens,
                        "output_tokens": final_message.usage.output_tokens
                    }
                })
                yield f"data: {done_data}\n\n"
        except anthropic.APIError as e:
            error_data = json.dumps({"type": "error", "message": str(e)})
            yield f"data: {error_data}\n\n"
        except asyncio.CancelledError:
            # Client disconnected; terminate normally
            pass

    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no"  # disable Nginx buffering
        }
    )
```
2.2 Streaming Tool Calls
Streaming with tool calls is more involved — the tool-call arguments have to be parsed out of the stream, the tool executed, and then generation continues as another streamed turn:
@app.post("/api/agent/stream")
async def agent_stream(request: Request):
"""带工具调用的Agent流式接口"""
body = await request.json()
messages = body.get("messages", [])
tools = body.get("tools", [])
async def generate():
current_messages = messages.copy()
while True:
tool_calls = []
current_tool_input = {}
current_tool_id = None
current_tool_name = None
async with client.messages.stream(
model="claude-opus-4-7",
max_tokens=2000,
tools=tools,
messages=current_messages
) as stream:
async for event in stream:
if event.type == "content_block_start":
if event.content_block.type == "text":
# 文本块开始,推送信号
yield f"data: {json.dumps({'type': 'text_start'})}\n\n"
elif event.content_block.type == "tool_use":
current_tool_id = event.content_block.id
current_tool_name = event.content_block.name
current_tool_input = {}
# 告知客户端工具调用开始
yield f"data: {json.dumps({'type': 'tool_start', 'name': current_tool_name})}\n\n"
elif event.type == "content_block_delta":
if event.delta.type == "text_delta":
# 推送文本增量
chunk = json.dumps({
"type": "text_delta",
"text": event.delta.text
}, ensure_ascii=False)
yield f"data: {chunk}\n\n"
elif event.delta.type == "input_json_delta":
# 累积工具输入参数(流式JSON片段)
# 实际上在content_block_stop时才能解析完整JSON
pass
elif event.type == "content_block_stop":
if current_tool_id:
# 工具调用参数接收完整,执行工具
yield f"data: {json.dumps({'type': 'tool_executing', 'name': current_tool_name})}\n\n"
tool_calls.append({
"id": current_tool_id,
"name": current_tool_name
})
current_tool_id = None
# 检查停止原因
final_message = await stream.get_final_message()
if final_message.stop_reason == "end_turn":
yield f"data: {json.dumps({'type': 'done'})}\n\n"
break
elif final_message.stop_reason == "tool_use":
# 执行工具并继续对话
tool_results = []
for tc in final_message.content:
if tc.type == "tool_use":
result = await execute_tool(tc.name, tc.input)
tool_results.append({
"type": "tool_result",
"tool_use_id": tc.id,
"content": json.dumps(result, ensure_ascii=False)
})
# 推送工具结果给客户端
yield f"data: {json.dumps({'type': 'tool_result', 'name': tc.name, 'result': str(result)[:200]})}\n\n"
# 更新消息历史,继续下一轮
current_messages.append({"role": "assistant", "content": final_message.content})
current_messages.append({"role": "user", "content": tool_results})
return StreamingResponse(generate(), media_type="text/event-stream",
headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"})
2.3 Error Handling and Retries for Streaming
```python
import asyncio
from typing import AsyncGenerator

class ResilientStreamClient:
    """Streaming client with a retry mechanism."""

    def __init__(self, max_retries: int = 3, retry_delay: float = 1.0):
        self.client = anthropic.AsyncAnthropic()
        self.max_retries = max_retries
        self.retry_delay = retry_delay

    async def stream_with_retry(
        self,
        messages: list,
        model: str = "claude-opus-4-7",
        max_tokens: int = 2000
    ) -> AsyncGenerator[str, None]:
        """Streaming generation with automatic retries."""
        last_error = None
        accumulated_text = ""  # text received so far, used to resume after a break

        for attempt in range(self.max_retries):
            try:
                async with self.client.messages.stream(
                    model=model,
                    max_tokens=max_tokens,
                    messages=messages
                ) as stream:
                    async for text in stream.text_stream:
                        accumulated_text += text
                        yield text
                return  # completed successfully, exit
            except anthropic.RateLimitError as e:
                # Rate limited: back off, then retry
                wait_time = self.retry_delay * (2 ** attempt)
                yield f"\n[Retrying in {wait_time:.0f}s]"
                await asyncio.sleep(wait_time)
                last_error = e
            except anthropic.APITimeoutError as e:
                if accumulated_text:
                    # Partial content exists, so ask the model to continue from it
                    messages = messages + [
                        {"role": "assistant", "content": accumulated_text},
                        {"role": "user", "content": "Please continue"}
                    ]
                    yield "\n[Connection interrupted, resuming...]"
                last_error = e
            except anthropic.APIConnectionError as e:
                await asyncio.sleep(self.retry_delay * (attempt + 1))
                last_error = e

        yield f"\n[Failed after {self.max_retries} retries: {str(last_error)}]"
```
3. Consuming the Stream on the Frontend
3.1 React Hook: useStreamingChat
```typescript
import { useState, useCallback, useRef } from 'react';

interface Message {
  role: 'user' | 'assistant';
  content: string;
}

interface StreamChunk {
  type: 'text_delta' | 'tool_start' | 'tool_result' | 'done' | 'error';
  text?: string;
  name?: string;
  result?: string;
  message?: string;
}

export function useStreamingChat() {
  const [messages, setMessages] = useState<Message[]>([]);
  const [isStreaming, setIsStreaming] = useState(false);
  const [currentResponse, setCurrentResponse] = useState('');
  const abortControllerRef = useRef<AbortController | null>(null);

  const sendMessage = useCallback(async (userMessage: string) => {
    const newMessages: Message[] = [
      ...messages,
      { role: 'user', content: userMessage }
    ];
    setMessages(newMessages);
    setIsStreaming(true);
    setCurrentResponse('');
    abortControllerRef.current = new AbortController();

    try {
      const response = await fetch('/api/chat/stream', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ messages: newMessages }),
        signal: abortControllerRef.current.signal
      });
      if (!response.ok) {
        throw new Error(`HTTP ${response.status}`);
      }

      const reader = response.body!.getReader();
      const decoder = new TextDecoder();
      let fullResponse = '';
      let buffer = '';  // carries partial SSE lines across chunk boundaries

      while (true) {
        const { done, value } = await reader.read();
        if (done) break;

        buffer += decoder.decode(value, { stream: true });
        const lines = buffer.split('\n');
        buffer = lines.pop() ?? '';  // keep the possibly incomplete last line

        for (const line of lines) {
          if (!line.startsWith('data: ')) continue;
          const data = line.slice(6);
          if (data === '[DONE]') break;

          try {
            const parsed: StreamChunk = JSON.parse(data);
            if (parsed.type === 'text_delta' && parsed.text) {
              fullResponse += parsed.text;
              setCurrentResponse(fullResponse);
            } else if (parsed.type === 'done') {
              setMessages(prev => [
                ...prev,
                { role: 'assistant', content: fullResponse }
              ]);
              setCurrentResponse('');
            }
          } catch (e) {
            // Ignore JSON parse errors on malformed frames
          }
        }
      }
    } catch (error: any) {
      if (error.name !== 'AbortError') {
        setCurrentResponse(prev => prev + '\n[An error occurred, please retry]');
      }
    } finally {
      setIsStreaming(false);
    }
  }, [messages]);

  const stopStreaming = useCallback(() => {
    abortControllerRef.current?.abort();
    setIsStreaming(false);
  }, []);

  return { messages, currentResponse, isStreaming, sendMessage, stopStreaming };
}
```
3.2 Streaming Markdown Rendering
During streaming, the Markdown text arrives incrementally; rendering it directly causes frequent DOM updates and visible flicker. The approach:
````tsx
import { useEffect, useState } from 'react';
import { marked } from 'marked';

export function StreamingMarkdown({ text }: { text: string }) {
  const [html, setHtml] = useState('');

  useEffect(() => {
    // Use requestAnimationFrame to avoid excessive re-renders
    const rafId = requestAnimationFrame(() => {
      // Patch up Markdown syntax that has not been closed yet
      const processedText = fixIncompleteMarkdown(text);
      setHtml(marked.parse(processedText) as string);
    });
    return () => cancelAnimationFrame(rafId);
  }, [text]);

  return (
    <div
      className="prose max-w-none"
      dangerouslySetInnerHTML={{ __html: html }}
    />
  );
}

function fixIncompleteMarkdown(text: string): string {
  // Close an unterminated code block
  const codeBlockCount = (text.match(/```/g) || []).length;
  if (codeBlockCount % 2 !== 0) {
    return text + '\n```';
  }
  // Close unterminated bold/italic markers
  const boldCount = (text.match(/\*\*/g) || []).length;
  if (boldCount % 2 !== 0) {
    return text + '**';
  }
  return text;
}
````
4. Performance Optimization: Batching and Throttling
```typescript
// Throttle streaming updates: avoid UI jank from overly frequent state updates
class StreamingBuffer {
  private buffer: string = '';
  private onFlush: (text: string) => void;
  private flushInterval: number;
  // setInterval returns different types in Node and browsers, so infer it
  private timer: ReturnType<typeof setInterval> | null = null;

  constructor(onFlush: (text: string) => void, flushIntervalMs: number = 50) {
    this.onFlush = onFlush;
    this.flushInterval = flushIntervalMs;
  }

  append(text: string): void {
    this.buffer += text;
    if (!this.timer) {
      this.timer = setInterval(() => this.flush(), this.flushInterval);
    }
  }

  flush(): void {
    if (this.buffer) {
      this.onFlush(this.buffer);
      this.buffer = '';
    }
  }

  destroy(): void {
    if (this.timer) {
      clearInterval(this.timer);
      this.timer = null;  // allow reuse after destroy
      this.flush();       // push any remaining buffered text
    }
  }
}
```
5. Production Deployment Notes
Nginx configuration: by default Nginx buffers upstream responses, which turns streaming output into one big batch. Buffering must be disabled:
```nginx
location /api/chat/stream {
    proxy_pass http://backend:8000;
    proxy_buffering off;            # disable proxy buffering
    proxy_cache off;                # disable caching
    proxy_read_timeout 300s;        # long timeout (streaming responses take longer)
    proxy_send_timeout 300s;

    # Headers required for SSE
    proxy_set_header Connection '';
    proxy_http_version 1.1;
    chunked_transfer_encoding on;
}
```
Timeout configuration: streaming requests typically take 10-30x longer than ordinary requests, so timeouts need to be raised accordingly at every layer (load balancer, reverse proxy, backend framework).
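As a sketch of what this can look like in the Python layer — the specific values are illustrative, and you should double-check the SDK timeout parameter and uvicorn flag against the versions you run:

```python
import anthropic
import httpx

# Raise the SDK's HTTP timeout so long generations are not cut off mid-stream
client = anthropic.AsyncAnthropic(
    timeout=httpx.Timeout(300.0, connect=10.0),
    max_retries=0,  # retries are handled by ResilientStreamClient above
)

# Server side: keep idle connections open long enough for slow streams, e.g.
#   uvicorn app:app --timeout-keep-alive 300
```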
6. Summary
Key engineering points for LLM streaming output:
- Backend: implement SSE with async generators, parse tool-call streams correctly, and configure retries with resume-on-interruption
- Frontend: consume SSE with EventSource or fetch + ReadableStream, throttle UI updates, and patch unclosed Markdown intelligently
- Infrastructure: disable Nginx buffering, set suitably long timeouts, and handle streaming connections separately in health checks
- Error handling: degrade gracefully on network interruptions, back off and retry on rate limits
Streaming output is more than a "typewriter effect" visual flourish — it is a core engineering capability of AI application architecture that directly shapes the quality users perceive.