为什么普通RAG需要知识图谱
标准RAG的工作方式是:把文档切块→向量化→相似度检索→喂给LLM。这个流程在大多数场景下表现良好,但有一类查询它天然处理不好:需要理解实体关系的问题。
举个例子:
"张三是李四的上司,李四负责A项目,A项目使用了B供应商的组件,B供应商最近有质量问题,请分析这对张三的工作有什么影响?"
这个问题需要沿着"张三→李四→A项目→B供应商→质量问题"这条关系链推理。向量检索擅长找"语义相似的段落",但不擅长沿关系链做多跳推理。
这就是GraphRAG的用武之地:用知识图谱显式建模实体关系,让RAG能做图结构上的推理。
GraphRAG核心架构
文档语料库
│
▼ 信息抽取
┌──────────────────────────────────┐
│ 知识图谱 │
│ 节点:实体(人、组织、概念、事件) │
│ 边:关系(属于、使用、导致、等) │
│ 属性:实体和关系的详细信息 │
└──────────────────────────────────┘
│
▼ 图检索
┌──────────────────────────────────┐
│ GraphRAG检索引擎 │
│ 1. 识别查询中的实体 │
│ 2. 在图中定位相关节点 │
│ 3. 图遍历获取关联子图 │
│ 4. 将子图转化为自然语言上下文 │
└──────────────────────────────────┘
│
▼ 生成
LLM(带图结构上下文)
第一步:从文档构建知识图谱
使用LLM抽取实体和关系
import json
from openai import AsyncOpenAI
from typing import NamedTuple
client = AsyncOpenAI()
class Entity(NamedTuple):
    """A knowledge-graph node extracted from text by the LLM."""
    name: str
    type: str  # one of: Person/Organization/Product/Event/Concept
    description: str
class Relation(NamedTuple):
    """A directed knowledge-graph edge between two named entities."""
    source: str  # name of the source entity
    target: str  # name of the target entity
    relation: str  # relation type (e.g. uses/creates/belongs_to)
    description: str
# Prompt template for LLM-based entity/relation extraction.
# NOTE(review): the JSON example below contains literal `{` / `}` braces, so
# calling str.format() on this template raises (format parses them as
# replacement fields). Substitute the {text} placeholder with str.replace(),
# or escape the braces as {{ }} first — confirm against call sites.
EXTRACTION_PROMPT = """请从以下文本中抽取实体和关系,构建知识图谱。
输出JSON格式:
{
"entities": [
{"name": "实体名称", "type": "类型(Person/Organization/Product/Event/Concept)", "description": "简短描述"}
],
"relations": [
{"source": "源实体名", "target": "目标实体名", "relation": "关系(如uses/creates/belongs_to/causes/reports_to等)", "description": "关系描述"}
]
}
文本:
{text}"""
async def extract_graph_from_text(text: str) -> tuple[list[Entity], list[Relation]]:
    """Extract knowledge-graph elements (entities and relations) from text.

    Args:
        text: A chunk of source text to run extraction over.

    Returns:
        A ``(entities, relations)`` tuple parsed from the model's JSON
        output; missing keys yield empty lists.

    Raises:
        json.JSONDecodeError: if the model response is not valid JSON.
    """
    # Bug fix: EXTRACTION_PROMPT.format(text=text) raises because the
    # template contains literal JSON braces, which str.format parses as
    # replacement fields. Substitute the single placeholder directly.
    prompt = EXTRACTION_PROMPT.replace("{text}", text)
    response = await client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": prompt}
        ],
        # Force a well-formed JSON object so json.loads below is reliable.
        response_format={"type": "json_object"},
        temperature=0,  # deterministic extraction
    )
    data = json.loads(response.choices[0].message.content)
    entities = [Entity(**e) for e in data.get("entities", [])]
    relations = [Relation(**r) for r in data.get("relations", [])]
    return entities, relations
# Batch-process a corpus into graph elements.
async def build_knowledge_graph_from_docs(documents: list[str]):
    """Run graph extraction over every document, merging duplicate entities.

    Returns a ``(name -> Entity)`` mapping (first occurrence wins) and a
    flat list of all extracted relations.
    """
    entities_by_name: dict = {}
    relations: list = []
    for document in documents:
        # Fairly large chunks are fine here: knowledge extraction does not
        # need the small chunk sizes that embedding retrieval prefers.
        for chunk in split_text(document, chunk_size=2000, overlap=200):
            chunk_entities, chunk_relations = await extract_graph_from_text(chunk)
            # Deduplicate entities by name, keeping the first one seen.
            for ent in chunk_entities:
                entities_by_name.setdefault(ent.name, ent)
            relations.extend(chunk_relations)
    return entities_by_name, relations
使用Neo4j存储图谱
from neo4j import AsyncGraphDatabase
class KnowledgeGraphStore:
    """Async Neo4j-backed store for extracted entities and relations."""

    def __init__(self, uri: str, user: str, password: str):
        # Async driver; sessions are opened per operation below.
        self.driver = AsyncGraphDatabase.driver(uri, auth=(user, password))

    async def insert_entities(self, entities: list[Entity]):
        """Batch-insert entity nodes (MERGE keeps names unique)."""
        async with self.driver.session() as session:
            # MERGE on name de-duplicates; SET refreshes type/description
            # and stamps updated_at on every upsert.
            await session.run(
                """
                UNWIND $entities AS entity
                MERGE (e:Entity {name: entity.name})
                SET e.type = entity.type,
                    e.description = entity.description,
                    e.updated_at = datetime()
                """,
                entities=[e._asdict() for e in entities]
            )

    async def insert_relations(self, relations: list[Relation]):
        """Batch-insert relation edges between already-inserted entities.

        Uses MATCH on both endpoints, so a relation whose source or target
        entity node does not exist yet is silently skipped.
        """
        async with self.driver.session() as session:
            await session.run(
                """
                UNWIND $relations AS rel
                MATCH (s:Entity {name: rel.source})
                MATCH (t:Entity {name: rel.target})
                MERGE (s)-[r:RELATES {type: rel.relation}]->(t)
                SET r.description = rel.description,
                    r.updated_at = datetime()
                """,
                relations=[r._asdict() for r in relations]
            )

    async def get_entity_neighborhood(
        self,
        entity_name: str,
        max_hops: int = 2,
        max_nodes: int = 50
    ) -> dict:
        """Fetch an entity's surrounding subgraph (used as RAG context).

        Args:
            entity_name: exact ``name`` of the center Entity node.
            max_hops: maximum relationship-path length to traverse.
            max_nodes: cap on neighbor rows returned by the subquery.

        Returns:
            ``{"center", "neighbors", "relations"}`` on success, or
            ``{"error": ...}`` when the entity does not exist.
        """
        async with self.driver.session() as session:
            # Cypher cannot take the variable-length bound (*1..N) or LIMIT
            # as query parameters, so both are spliced in via str.replace on
            # the {hops}/{limit} placeholders. Safe from injection only
            # because both values are ints supplied by our own API.
            result = await session.run(
                """
                MATCH (center:Entity {name: $name})
                CALL {
                WITH center
                MATCH path = (center)-[*1..{hops}]-(neighbor)
                RETURN neighbor, relationships(path) as rels
                LIMIT {limit}
                }
                RETURN center, collect(DISTINCT neighbor) as neighbors,
                collect(DISTINCT rels) as all_rels
                """.replace("{hops}", str(max_hops)).replace("{limit}", str(max_nodes)),
                name=entity_name
            )
            record = await result.single()
            if not record:
                return {"error": f"Entity '{entity_name}' not found"}
            return {
                "center": dict(record["center"]),
                "neighbors": [dict(n) for n in record["neighbors"]],
                # NOTE(review): per the Cypher above, each element here is a
                # list of relationship objects (one list per path) — consumers
                # must iterate the nested lists; confirm against callers.
                "relations": record["all_rels"]
            }
第二步:GraphRAG检索
实体识别 + 图遍历
class GraphRAGRetriever:
    """Hybrid retriever: knowledge-graph traversal plus vector search."""

    def __init__(self, graph_store: KnowledgeGraphStore, vector_store, llm):
        self.graph = graph_store    # KnowledgeGraphStore for subgraph lookups
        self.vectors = vector_store  # vector index exposing similarity_search()
        self.llm = llm              # LLM client exposing async complete()

    async def retrieve(self, query: str) -> str:
        """GraphRAG retrieval: return a combined graph+vector context string."""
        # 1. Identify the key entities mentioned in the query.
        entities = await self._extract_query_entities(query)
        # 2. Vector search for relevant passages (supplementary context).
        vector_results = await self.vectors.similarity_search(query, k=5)
        # 3. Graph traversal: collect each entity's neighborhood subgraph.
        graph_contexts = []
        for entity in entities[:3]:  # cap at 3 core entities to bound cost
            neighborhood = await self.graph.get_entity_neighborhood(
                entity_name=entity,
                max_hops=2
            )
            if "error" not in neighborhood:
                graph_context = self._subgraph_to_text(neighborhood)
                graph_contexts.append(graph_context)
        # 4. Merge vector-retrieval and graph-retrieval results.
        combined_context = self._merge_contexts(
            vector_contexts=[r.page_content for r in vector_results],
            graph_contexts=graph_contexts
        )
        return combined_context

    async def _extract_query_entities(self, query: str) -> list[str]:
        """Ask the LLM for key entity names in the query (comma-separated)."""
        response = await self.llm.complete(f"""
从以下查询中抽取关键实体名称(人名、组织名、产品名、概念名等)。
只返回实体名称列表,用逗号分隔。
查询:{query}
""")
        return [e.strip() for e in response.split(",") if e.strip()]

    def _subgraph_to_text(self, subgraph: dict) -> str:
        """Render the subgraph as natural-language triples for the LLM."""
        center = subgraph["center"]
        text = f"关于 {center['name']}({center.get('type', '实体')})的知识图谱信息:\n\n"
        # subgraph["relations"] is a list of relationship lists (one per
        # path), as produced by get_entity_neighborhood.
        for relation_path in subgraph.get("relations", [])[:20]:  # at most 20 paths
            for rel in relation_path:
                # NOTE(review): 'start_node_name'/'end_node_name' are read as
                # relationship properties here, but insert_relations only sets
                # 'type' and 'description' — it looks like these keys are
                # never stored; confirm against the graph schema.
                text += f"- {rel['start_node_name']} --[{rel['type']}]--> {rel['end_node_name']}"
                if rel.get("description"):
                    text += f"({rel['description']})"
                text += "\n"
        return text

    def _merge_contexts(self, vector_contexts: list, graph_contexts: list) -> str:
        """Combine graph and vector contexts into one markdown-style string."""
        merged = ""
        if graph_contexts:
            merged += "## 知识图谱信息\n\n"
            merged += "\n---\n".join(graph_contexts)
            merged += "\n\n"
        if vector_contexts:
            merged += "## 相关文档片段\n\n"
            merged += "\n---\n".join(vector_contexts[:3])  # at most 3 passages
        return merged
第三步:图增强的问答
class GraphRAGQA:
    """End-to-end GraphRAG question answering (retrieve -> prompt -> answer)."""

    def __init__(self, retriever: "GraphRAGRetriever", llm):
        self.retriever = retriever  # supplies combined graph+vector context
        self.llm = llm              # LLM client exposing async complete()

    async def answer(self, question: str) -> dict:
        """Run the full GraphRAG QA flow for one question.

        Returns:
            A dict with the answer text, the entities cited in the answer
            (for rendering a knowledge graph in the UI), and rough
            context-size statistics.
        """
        # 1. Retrieve combined context (graph + vector).
        context = await self.retriever.retrieve(question)
        # 2. Build a prompt that carries the graph-structured context.
        prompt = f"""请基于以下知识图谱和文档信息回答问题。
{context}
---
问题:{question}
回答时请:
1. 充分利用知识图谱中的实体关系信息
2. 如果问题涉及多实体关系,请明确说明推理路径
3. 如果信息不足,请说明缺少哪些信息"""
        response = await self.llm.complete(prompt)
        # 3. Extract the entities cited in the answer (frontend graph view).
        # Bug fix: _extract_cited_entities was called here but never defined
        # anywhere, raising AttributeError at runtime; implemented below.
        cited_entities = await self._extract_cited_entities(response)
        return {
            "answer": response,
            "cited_entities": cited_entities,
            "context_used": {
                # Crude heuristics over the rendered context string:
                # "-->" marks one graph edge, "---" separates sections/chunks.
                "graph_nodes": len(context.split("-->")) - 1,
                "vector_chunks": context.count("---")
            }
        }

    async def _extract_cited_entities(self, answer_text: str) -> list[str]:
        """Ask the LLM which entity names the answer mentions.

        Returns a comma-split, whitespace-stripped list of entity names
        (empty items dropped).
        """
        response = await self.llm.complete(
            f"从以下回答中抽取提到的关键实体名称。只返回实体名称列表,用逗号分隔。\n\n回答:{answer_text}"
        )
        return [name.strip() for name in response.split(",") if name.strip()]
微软GraphRAG的工程化实现
微软开源的GraphRAG项目提供了生产级的实现,值得参考:
# 安装
pip install graphrag
# 初始化项目
mkdir my-graphrag && cd my-graphrag
python -m graphrag.index --init --root .
# settings.yml - 核心配置
llm:
api_key: ${GRAPHRAG_API_KEY}
type: openai_chat
model: gpt-4o
embeddings:
llm:
api_key: ${GRAPHRAG_API_KEY}
type: openai_embedding
model: text-embedding-3-small
chunks:
size: 300
overlap: 100
entity_extraction:
max_gleanings: 1 # 额外补抽轮数;调大可通过多次抽取提高召回率
community_reports:
max_length: 2000 # 社区摘要长度
# 使用GraphRAG做查询
import asyncio
from graphrag.query.api import local_search, global_search
async def query_with_graphrag(question: str, mode="local"):
    """Query an indexed GraphRAG project.

    mode="local":  entity-anchored search — best for specific questions.
    Any other mode: global search — corpus-wide understanding, best for
    summarization-style questions.
    """
    # Both entry points take identical arguments, so dispatch on mode and
    # make a single call.
    search = local_search if mode == "local" else global_search
    result = await search(
        query=question,
        root="./",
        community_level=2,
    )
    return result.response
Local vs Global的选择:
local search:"张三和李四的关系是什么?" → 精确实体关系查询
global search:"这份报告的主要主题是什么?" → 全局语义理解
适用场景与局限
最适合GraphRAG的场景
✅ 企业知识管理:组织架构、项目关系、人员关联 ✅ 科研文献分析:研究者→论文→机构→引用关系 ✅ 供应链分析:供应商→组件→产品→客户的关系链 ✅ 合规审查:实体关联、交易路径、风险传导分析 ✅ 多跳问答:需要沿关系链推理的复杂问题
GraphRAG的代价
- 构建成本高:抽取知识图谱需要大量LLM调用,成本是普通RAG的5-10倍
- 维护复杂:图谱需要随文档更新保持同步
- 图谱质量依赖抽取质量:LLM抽取错误的关系会直接影响推理
- 延迟更高:图遍历+向量检索的复合延迟
结论:GraphRAG不是普通RAG的替代品,而是在特定高价值场景下的增强工具。先评估你的用例是否真的需要关系推理,再决定是否引入图谱层的复杂性。
结语
GraphRAG代表了RAG技术的一个重要进化方向:从"找相关段落"到"理解实体关系网络"。随着Neo4j、Kuzu等图数据库的成熟,以及微软GraphRAG等开源工具的完善,在2026年将知识图谱整合进RAG系统已经是工程上可行的选择。
关键是选对场景:当你的用户在问"A和B有什么关系"、"X影响Y会通过什么路径"这类问题时,GraphRAG能给出传统RAG无法提供的深度答案。