第14章 知识库与信息管理MCP应用
前言
知识是企业最宝贵的资产。本章展示如何通过MCP构建企业知识库和研究助手系统,让LLM成为智能知识顾问。
14.1 案例1:企业知识库智能助手
14.1.1 应用场景
graph TB
A["企业知识库"] --> B["文档库"]
A --> C["Wiki系统"]
A --> D["FAQ库"]
A --> E["最佳实践库"]
F["知识需求"] --> F1["新员工培训"]
F --> F2["日常问题解答"]
F --> F3["知识维护"]
F --> F4["知识更新"]
G["Claude KBS"] --> G1["智能搜索"]
G --> G2["知识推荐"]
G --> G3["知识生成"]
G --> G4["知识组织"]
G1 --> F1
G2 --> F2
G3 --> F3
G4 --> F4
14.1.2 实现架构
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
import json
class ContentType(Enum):
    """Closed set of content kinds a knowledge-base entry can have.

    Every member's value is the lowercase string form of its name.
    """

    DOCUMENT = "document"
    FAQ = "faq"
    WIKI = "wiki"
    BEST_PRACTICE = "best_practice"
    PROCEDURE = "procedure"
@dataclass
class KnowledgeItem:
    """One entry of the knowledge base.

    Attributes:
        id: Unique identifier of the entry.
        title: Entry title.
        content: Full text of the entry.
        content_type: Kind of content (a ContentType member).
        tags: Free-form tags attached to the entry.
        created_at: Creation timestamp.
        updated_at: Last-modification timestamp.
        author: Author identifier.
        category: Category the entry is filed under.
        related_items: IDs of explicitly related entries. May be omitted
            or passed as None; either way it ends up as a list.
        view_count: View counter, starts at 0.
        helpful_count: "Helpful" feedback counter, starts at 0.
        unhelpful_count: "Unhelpful" feedback counter, starts at 0.
    """

    id: str
    title: str
    content: str
    content_type: ContentType
    tags: List[str]
    created_at: datetime
    updated_at: datetime
    author: str
    category: str
    related_items: Optional[List[str]] = None
    view_count: int = 0
    helpful_count: int = 0
    unhelpful_count: int = 0

    def __post_init__(self):
        # A dataclass field cannot take a bare list as its default, so None
        # acts as the "not provided" sentinel and every instance receives
        # its own fresh list here. Callers can then append or slice without
        # a None check, and no list is shared between instances.
        if self.related_items is None:
            self.related_items = []
class KnowledgeGraph:
    """In-memory store of knowledge items with tag and category indexes.

    Attributes:
        nodes: Knowledge items keyed by item ID.
        edges: Adjacency lists keyed by item ID. Initialised empty; no
            method of this class writes to it.
        tags_index: Maps a tag to the IDs of the items carrying it, in
            insertion order.
        category_index: Maps a category to the IDs of its items, in
            insertion order.
    """

    def __init__(self):
        self.nodes: Dict[str, KnowledgeItem] = {}
        self.edges: Dict[str, List[str]] = {}
        self.tags_index: Dict[str, List[str]] = {}
        self.category_index: Dict[str, List[str]] = {}

    def add_item(self, item: KnowledgeItem):
        """Add a knowledge item, replacing any item with the same ID.

        Adding an ID that is already present first removes that ID from
        both indexes, so an item is never listed twice under one key and
        tags or categories it no longer has do not linger.

        Args:
            item: The knowledge item to store and index.
        """
        if item.id in self.nodes:
            self._unindex(item.id)
        self.nodes[item.id] = item

        # Tag index; the membership test also guards against an item whose
        # own tag list repeats a tag.
        for tag in item.tags:
            tagged = self.tags_index.setdefault(tag, [])
            if item.id not in tagged:
                tagged.append(item.id)

        # Category index.
        filed = self.category_index.setdefault(item.category, [])
        if item.id not in filed:
            filed.append(item.id)

    def _unindex(self, item_id: str) -> None:
        """Remove item_id from every tag/category bucket, dropping empty keys."""
        for index in (self.tags_index, self.category_index):
            # Iterate over a snapshot of the keys because keys may be deleted.
            for key in list(index):
                bucket = index[key]
                if item_id in bucket:
                    bucket[:] = [other for other in bucket if other != item_id]
                    if not bucket:
                        del index[key]

    def find_related_items(self, item_id: str, depth: int = 2) -> List[str]:
        """Find items that share a tag or the category with an item.

        Results are ordered deterministically: items sharing tags come
        first (in the order of the item's tags and of each tag's index),
        followed by items that only share the category.

        Args:
            item_id: ID of the item to start from.
            depth: Search depth. Accepted for interface compatibility but
                not used; only direct tag/category neighbours are returned.

        Returns:
            Up to 10 related item IDs, never including item_id itself;
            an empty list when item_id is unknown.
        """
        item = self.nodes.get(item_id)
        if item is None:
            return []

        # A dict serves as an insertion-ordered set, so the 10-item cut-off
        # below always keeps the same IDs from run to run.
        related: Dict[str, None] = {}
        for tag in item.tags:
            related.update(dict.fromkeys(self.tags_index.get(tag, [])))
        related.update(dict.fromkeys(self.category_index.get(item.category, [])))

        related.pop(item_id, None)  # drop the item itself
        return list(related)[:10]

    def search_by_tags(self, tags: List[str], operator: str = "OR") -> List[str]:
        """Search item IDs by tags.

        Args:
            tags: Tags to look up.
            operator: "OR" (an item needs any of the tags) or "AND" (an
                item needs all of them); matched case-insensitively.

        Returns:
            Matching item IDs without duplicates, in index order; an
            empty list when tags is empty.

        Raises:
            ValueError: If operator is neither "OR" nor "AND".
        """
        if not tags:
            return []

        mode = operator.upper()
        if mode == "OR":
            union: Dict[str, None] = {}
            for tag in tags:
                union.update(dict.fromkeys(self.tags_index.get(tag, [])))
            return list(union)
        if mode == "AND":
            candidates = dict.fromkeys(self.tags_index.get(tags[0], []))
            required = [set(self.tags_index.get(tag, [])) for tag in tags[1:]]
            return [
                item_id for item_id in candidates
                if all(item_id in ids for ids in required)
            ]
        raise ValueError(f"Unsupported operator: {operator!r} (expected 'OR' or 'AND')")

    def get_statistics(self) -> Dict:
        """Return summary statistics about the knowledge base.

        Returns:
            Dict with item/tag/category totals, a per-content-type count,
            and the top 10 items by view_count and by helpful_count.
        """
        return {
            "total_items": len(self.nodes),
            "total_tags": len(self.tags_index),
            "total_categories": len(self.category_index),
            "items_by_type": self._count_by_type(),
            "most_viewed": self._get_top_items("view_count", 10),
            "most_helpful": self._get_top_items("helpful_count", 10)
        }

    def _count_by_type(self) -> Dict:
        """Count stored items per content-type value."""
        counts = {}
        for item in self.nodes.values():
            content_type = item.content_type.value
            counts[content_type] = counts.get(content_type, 0) + 1
        return counts

    def _get_top_items(self, field: str, limit: int = 10) -> List[Dict]:
        """Return the `limit` items with the highest value of `field`.

        Items lacking the attribute are treated as having a value of 0.
        """
        ranked = sorted(
            self.nodes.values(),
            key=lambda entry: getattr(entry, field, 0),
            reverse=True
        )
        return [{
            "id": entry.id,
            "title": entry.title,
            "score": getattr(entry, field, 0)
        } for entry in ranked[:limit]]
class KnowledgeSearchEngine:
    """Case-insensitive substring search over the items of a knowledge graph.

    Attributes:
        graph: The knowledge graph whose `nodes` are searched.
        search_history: One record (query, result count, timestamp) per
            call to full_text_search.
    """

    def __init__(self, knowledge_graph: KnowledgeGraph):
        self.graph = knowledge_graph
        self.search_history: List[Dict] = []

    async def full_text_search(self, query: str,
                               filters: Optional[Dict] = None,
                               limit: int = 20) -> List[Dict]:
        """Search titles, contents and tags for a query string.

        The query is stripped and lowercased and then matched as one
        substring. A title hit scores 10, every occurrence in the content
        scores 1, and a hit in any tag scores 5.

        Args:
            query: Search text. An empty or whitespace-only query matches
                nothing.
            filters: Optional dict with any of the keys "category",
                "content_type" and "tags" (see _apply_filters).
            limit: Maximum number of results; negative values are
                treated as 0.

        Returns:
            Result dicts sorted by descending score.

        Raises:
            ValueError: If filters["content_type"] is not a valid
                ContentType value and at least one item matched the query.
        """
        needle = query.strip().lower()
        results = []

        # An empty needle is a substring of every string and str.count("")
        # returns len(s) + 1, so scanning with it would return the whole
        # knowledge base with meaningless scores.
        if needle:
            for item in self.graph.nodes.values():
                score = self._score(item, needle)
                if score > 0 and self._apply_filters(item, filters):
                    results.append({
                        "id": item.id,
                        "title": item.title,
                        "category": item.category,
                        "content_type": item.content_type.value,
                        "score": score,
                        "tags": item.tags,
                        "updated_at": item.updated_at.isoformat()
                    })
            # Highest relevance first.
            results.sort(key=lambda hit: hit["score"], reverse=True)

        # The history keeps the query exactly as the caller typed it and
        # the number of matches before the limit is applied.
        self.search_history.append({
            "query": query,
            "results_count": len(results),
            "timestamp": datetime.now().isoformat()
        })
        return results[:max(limit, 0)]

    def _score(self, item: KnowledgeItem, needle: str) -> int:
        """Return the relevance score of `item` for a lowercased needle."""
        title_score = 10 if needle in item.title.lower() else 0
        content_score = item.content.lower().count(needle)
        tag_score = 5 if any(needle in tag.lower() for tag in item.tags) else 0
        return title_score + content_score + tag_score

    def _apply_filters(self, item: KnowledgeItem, filters: Optional[Dict] = None) -> bool:
        """Tell whether `item` passes the optional filters.

        Args:
            item: Item to check.
            filters: None/empty for "no filtering", otherwise a dict with
                any of: "category" (exact match), "content_type" (a
                ContentType member or its string value) and "tags" (a
                list of tags, or a single tag string; the item needs at
                least one of them).

        Returns:
            True when the item satisfies every filter that is present.

        Raises:
            ValueError: If filters["content_type"] is not a valid
                ContentType value.
        """
        if not filters:
            return True
        if "category" in filters and item.category != filters["category"]:
            return False
        if "content_type" in filters:
            if item.content_type != ContentType(filters["content_type"]):
                return False
        if "tags" in filters:
            wanted = filters["tags"]
            # A bare string would otherwise be iterated character by
            # character and practically never match.
            if isinstance(wanted, str):
                wanted = [wanted]
            if not any(tag in item.tags for tag in wanted):
                return False
        return True
class KnowledgeAssistant:
    """Question answering and onboarding helper on top of a knowledge graph.

    Attributes:
        graph: The knowledge graph being queried.
        search_engine: KnowledgeSearchEngine built over the same graph.
    """

    def __init__(self, knowledge_graph: KnowledgeGraph):
        self.graph = knowledge_graph
        self.search_engine = KnowledgeSearchEngine(knowledge_graph)

    async def answer_question(self, question: str) -> Dict:
        """Answer a question with the best-matching knowledge item.

        The question is passed to the search engine as the query. The top
        result becomes the answer and its view_count is incremented.

        Args:
            question: The user's question.

        Returns:
            When nothing matches: a dict with "found" False, a message and
            category suggestions. Otherwise a dict with "found" True, the
            answer fields, up to three related items and a confidence
            (top score divided by 10, capped at 1.0).
        """
        search_results = await self.search_engine.full_text_search(question, limit=5)
        if not search_results:
            return {
                "found": False,
                "message": "No relevant knowledge found",
                "suggestions": self._get_suggestions(question)
            }

        top_result = search_results[0]
        item = self.graph.nodes[top_result["id"]]
        item.view_count += 1

        # related_items may be None or may reference IDs that are not in
        # the graph; unresolved IDs are skipped rather than raising
        # KeyError after the answer has already been found.
        related = [
            {"id": rid, "title": self.graph.nodes[rid].title}
            for rid in (item.related_items or [])
            if rid in self.graph.nodes
        ][:3]

        return {
            "found": True,
            "answer": {
                "title": item.title,
                "content": item.content,
                "category": item.category,
                "tags": item.tags,
                "author": item.author,
                "updated_at": item.updated_at.isoformat()
            },
            "related_items": related,
            "confidence": min(top_result["score"] / 10, 1.0)
        }

    def _get_suggestions(self, question: str) -> List[str]:
        """Suggest up to three categories whose words occur in the question.

        Both sides are lowercased before comparing, so a category such as
        "Tech Docs" is suggested for a question containing "tech".
        """
        question_lower = question.lower()
        suggestions = [
            category
            for category in self.graph.category_index
            if any(word.lower() in question_lower for word in category.split())
        ]
        return suggestions[:3]

    async def get_onboarding_path(self, role: str) -> Dict:
        """Build an onboarding reading list for a role.

        Selects every PROCEDURE item that has a tag equal to the role
        (case-insensitive).

        Args:
            role: Employee role to match against item tags.

        Returns:
            Dict with the role, the number of selected items, the total
            estimated time in hours, and the items sorted by ascending
            reading time.
        """
        role_lower = role.lower()
        training_items = []
        for item in self.graph.nodes.values():
            if item.content_type != ContentType.PROCEDURE:
                continue
            if any(tag.lower() == role_lower for tag in item.tags):
                training_items.append({
                    "id": item.id,
                    "title": item.title,
                    "category": item.category,
                    "duration_minutes": self._estimate_reading_time(item.content)
                })
        return {
            "role": role,
            "training_count": len(training_items),
            "estimated_hours": sum(entry["duration_minutes"] for entry in training_items) / 60,
            "items": sorted(training_items, key=lambda entry: entry["duration_minutes"])
        }

    def _estimate_reading_time(self, content: str) -> int:
        """Estimate reading time in whole minutes (at least 1).

        Words are whitespace-delimited and read at 250 words per minute.
        Text without whitespace counts as a single word, so it always
        yields the 1-minute floor.
        """
        word_count = len(content.split())
        return max(1, word_count // 250)
14.2 案例2:学术研究论文助手
14.2.1 论文管理系统
@dataclass
class Paper:
    """Record describing one academic paper.

    Attributes:
        id: Unique identifier of the paper.
        title: Paper title.
        authors: Author names.
        year: Publication year.
        abstract: Abstract text.
        keywords: Keywords attached to the paper.
        citations: IDs of the papers this paper cites.
        content: Full text of the paper.
        references: Reference entries, one dict per reference.
        doi: Digital Object Identifier; None when not provided.
    """

    # Required fields (positional order is part of the interface).
    id: str
    title: str
    authors: List[str]
    year: int
    abstract: str
    keywords: List[str]
    citations: List[str]
    content: str
    references: List[Dict]

    # Optional field.
    doi: Optional[str] = None
class PaperRepository:
    """In-memory paper collection with a citation graph.

    Attributes:
        papers: Papers keyed by paper ID.
        citation_graph: Maps a paper ID to the IDs it cites; only papers
            with at least one citation have an entry.
    """

    def __init__(self):
        self.papers: Dict[str, Paper] = {}
        self.citation_graph: Dict[str, List[str]] = {}

    def add_paper(self, paper: Paper):
        """Add a paper, replacing any paper with the same ID.

        Args:
            paper: The paper to store.
        """
        self.papers[paper.id] = paper
        if paper.citations:
            self.citation_graph[paper.id] = paper.citations
        else:
            # When a paper is re-added without citations, drop its previous
            # entry so it no longer counts as citing anything.
            self.citation_graph.pop(paper.id, None)

    async def find_related_papers(self, paper_id: str,
                                  similarity_threshold: float = 0.5) -> List[Dict]:
        """Find papers whose keywords overlap with those of a paper.

        Similarity is the Jaccard index of the two keyword sets.

        Args:
            paper_id: ID of the reference paper.
            similarity_threshold: Minimum similarity for a paper to be
                included.

        Returns:
            Up to 10 result dicts sorted by descending similarity, each
            with the shared keywords in sorted order; an empty list when
            paper_id is unknown.
        """
        source_paper = self.papers.get(paper_id)
        if source_paper is None:
            return []

        source_keywords = set(source_paper.keywords)
        related = []
        for pid, paper in self.papers.items():
            if pid == paper_id:
                continue
            paper_keywords = set(paper.keywords)
            shared = source_keywords & paper_keywords
            union_size = len(source_keywords | paper_keywords)
            # Two papers without any keywords have similarity 0.
            similarity = len(shared) / union_size if union_size > 0 else 0
            if similarity >= similarity_threshold:
                related.append({
                    "id": pid,
                    "title": paper.title,
                    "year": paper.year,
                    "similarity": similarity,
                    # Sorted so the output does not depend on set ordering.
                    "matching_keywords": sorted(shared)
                })

        related.sort(key=lambda entry: entry["similarity"], reverse=True)
        return related[:10]

    def analyze_citation_impact(self, paper_id: str) -> Dict:
        """Summarise how a paper cites and is cited within the repository.

        Args:
            paper_id: ID of the paper to analyse.

        Returns:
            Dict with the number of stored papers citing this one
            ("cited_by_count"), the number of papers it cites
            ("citation_count"), the smaller of the two
            ("h_index_contribution") and an impact level; an empty dict
            when paper_id is unknown.
        """
        paper = self.papers.get(paper_id)
        if not paper:
            return {}

        # Incoming citations: papers whose citation list contains this ID.
        cited_by = sum(1 for citations in self.citation_graph.values()
                       if paper_id in citations)
        # Outgoing citations: how many papers this one cites.
        citation_depth = len(paper.citations)

        return {
            "paper_id": paper_id,
            "title": paper.title,
            "year": paper.year,
            "cited_by_count": cited_by,
            "citation_count": citation_depth,
            "h_index_contribution": min(cited_by, citation_depth),
            "impact_level": self._calculate_impact_level(cited_by, citation_depth)
        }

    def _calculate_impact_level(self, cited_by: int, citation_depth: int) -> str:
        """Map citation counts to "high", "medium" or "low".

        The score is cited_by plus half of citation_depth; 50 and above
        is "high", 20 and above is "medium", anything else is "low".
        """
        score = cited_by + citation_depth * 0.5
        if score >= 50:
            return "high"
        if score >= 20:
            return "medium"
        return "low"
14.3 高级功能:嵌入式搜索与推荐
class EmbeddingBasedSearch:
    """Semantic search over precomputed document embeddings.

    Attributes:
        embeddings: Embedding vectors keyed by document ID.
        documents: Knowledge items keyed by document ID.
    """

    def __init__(self):
        self.embeddings: Dict[str, List[float]] = {}
        self.documents: Dict[str, KnowledgeItem] = {}

    def compute_similarity(self, embedding1: List[float],
                           embedding2: List[float]) -> float:
        """Compute the cosine similarity of two embedding vectors.

        Args:
            embedding1: First vector.
            embedding2: Second vector, of the same length.

        Returns:
            Cosine similarity, between -1 and 1 up to floating-point
            rounding; 0.0 when either vector has zero norm.

        Raises:
            ValueError: If the vectors differ in length.
        """
        import math

        # zip() would silently ignore the trailing components of the longer
        # vector and produce a plausible-looking but wrong similarity.
        if len(embedding1) != len(embedding2):
            raise ValueError(
                f"Embedding dimensions differ: {len(embedding1)} != {len(embedding2)}"
            )

        dot_product = sum(e1 * e2 for e1, e2 in zip(embedding1, embedding2))
        norm1 = math.sqrt(sum(e ** 2 for e in embedding1))
        norm2 = math.sqrt(sum(e ** 2 for e in embedding2))
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return dot_product / (norm1 * norm2)

    async def semantic_search(self, query: str, query_embedding: List[float],
                              limit: int = 10) -> List[Dict]:
        """Rank documents by similarity to a query embedding.

        Args:
            query: Query text. Not used for ranking; only the embedding is.
            query_embedding: Embedding of the query.
            limit: Maximum number of results.

        Returns:
            Result dicts sorted by descending similarity. Embeddings
            without a matching entry in `documents` are skipped.

        Raises:
            ValueError: If a stored embedding and the query embedding
                differ in length.
        """
        results = []
        for doc_id, embedding in self.embeddings.items():
            doc = self.documents.get(doc_id)
            if doc is None:
                # No document to report, so skip the similarity computation.
                continue
            results.append({
                "id": doc_id,
                "title": doc.title,
                "similarity": self.compute_similarity(query_embedding, embedding),
                "category": doc.category,
                "tags": doc.tags
            })

        results.sort(key=lambda hit: hit["similarity"], reverse=True)
        return results[:limit]
class KnowledgeRecommendationEngine:
    """Tag-based recommendations driven by recorded user interactions.

    Attributes:
        graph: Knowledge graph providing `nodes` and `tags_index`.
        user_interactions: Interaction records keyed by user ID.
    """

    def __init__(self, knowledge_graph: KnowledgeGraph):
        self.graph = knowledge_graph
        self.user_interactions: Dict[str, List[Dict]] = {}

    def record_interaction(self, user_id: str, item_id: str,
                           action: str, timestamp: datetime = None):
        """Record one interaction of a user with a knowledge item.

        Args:
            user_id: ID of the user.
            item_id: ID of the knowledge item.
            action: Kind of interaction (view, like, share).
            timestamp: When it happened; the current time when omitted.
        """
        self.user_interactions.setdefault(user_id, []).append({
            "item_id": item_id,
            "action": action,
            "timestamp": timestamp or datetime.now()
        })

    async def get_personalized_recommendations(self, user_id: str,
                                               limit: int = 5) -> List[Dict]:
        """Recommend items that share tags with what the user interacted with.

        Every recorded interaction counts, whatever its action. Items the
        user already interacted with are excluded. Candidates are ranked
        by view_count; candidates with equal view_count keep the order in
        which they were discovered, so the result is stable across runs.

        Args:
            user_id: ID of the user.
            limit: Maximum number of recommendations.

        Returns:
            Dicts with "id", "title" and "score" (the view_count), best
            first; an empty list for a user without interactions.
        """
        interactions = self.user_interactions.get(user_id)
        if interactions is None:
            return []

        # dict.fromkeys keeps first-seen order while removing duplicates.
        seen_ids = dict.fromkeys(entry["item_id"] for entry in interactions)

        # Ordered candidate set, filled in discovery order.
        candidates: Dict[str, None] = {}
        for item_id in seen_ids:
            item = self.graph.nodes.get(item_id)
            if item is None:
                continue
            for tag in item.tags:
                for candidate_id in self.graph.tags_index.get(tag, []):
                    if candidate_id not in seen_ids:
                        candidates[candidate_id] = None

        scored_recommendations = [
            {
                "id": rec_id,
                "title": self.graph.nodes[rec_id].title,
                "score": self.graph.nodes[rec_id].view_count
            }
            for rec_id in candidates if rec_id in self.graph.nodes
        ]
        # list.sort is stable, also with reverse=True, so ties keep
        # discovery order.
        scored_recommendations.sort(key=lambda rec: rec["score"], reverse=True)
        return scored_recommendations[:limit]
class KnowledgeContributionSystem:
    """Submission and approval workflow for user-contributed knowledge.

    Attributes:
        graph: Knowledge graph that approved contributions are added to.
        pending_contributions: Unreviewed submissions keyed by
            contribution ID.
        contribution_history: One record per approved contribution.
        user_contributions: Number of submissions per user ID.
    """

    def __init__(self, knowledge_graph: KnowledgeGraph):
        self.graph = knowledge_graph
        self.pending_contributions: Dict[str, Dict] = {}
        self.contribution_history: List[Dict] = []
        self.user_contributions: Dict[str, int] = {}

    async def submit_contribution(self, user_id: str, content: str,
                                  title: str, category: str,
                                  tags: List[str]) -> Dict:
        """Queue a contribution for review.

        Args:
            user_id: ID of the contributing user.
            content: Body text of the contribution.
            title: Title of the contribution.
            category: Category to file it under.
            tags: Tags for the contribution; None is treated as no tags.

        Returns:
            Dict with the generated contribution ID, the status
            "submitted" and a confirmation message.
        """
        import uuid

        contribution_id = str(uuid.uuid4())
        self.pending_contributions[contribution_id] = {
            "user_id": user_id,
            "content": content,
            "title": title,
            "category": category,
            # Stored as a private list: None would break indexing when the
            # contribution is approved, and a copy keeps later changes to
            # the caller's list out of the queue.
            "tags": list(tags or []),
            "submitted_at": datetime.now().isoformat(),
            "status": "pending",
            "reviews": []
        }

        # Count the submission for the leaderboard.
        self.user_contributions[user_id] = self.user_contributions.get(user_id, 0) + 1

        return {
            "contribution_id": contribution_id,
            "status": "submitted",
            "message": "Thank you for your contribution! It will be reviewed shortly."
        }

    async def approve_contribution(self, contribution_id: str) -> Dict:
        """Approve a pending contribution and add it to the knowledge graph.

        The contribution becomes a new DOCUMENT item authored by the
        submitting user and is removed from the pending queue.

        Args:
            contribution_id: ID returned by submit_contribution.

        Returns:
            Dict with the new item ID and status "approved", or
            {"error": "Contribution not found"} for an unknown ID.
        """
        contribution = self.pending_contributions.get(contribution_id)
        if contribution is None:
            return {"error": "Contribution not found"}

        import uuid

        item_id = str(uuid.uuid4())
        # One timestamp for the whole approval, so created_at, updated_at
        # and approved_at agree exactly.
        approved_at = datetime.now()
        item = KnowledgeItem(
            id=item_id,
            title=contribution["title"],
            content=contribution["content"],
            content_type=ContentType.DOCUMENT,
            tags=contribution["tags"],
            created_at=approved_at,
            updated_at=approved_at,
            author=contribution["user_id"],
            category=contribution["category"]
        )
        self.graph.add_item(item)

        self.contribution_history.append({
            "original_id": contribution_id,
            "item_id": item_id,
            "user_id": contribution["user_id"],
            "approved_at": approved_at.isoformat()
        })

        del self.pending_contributions[contribution_id]

        return {
            "item_id": item_id,
            "status": "approved",
            "message": "Contribution approved and added to knowledge base"
        }

    def get_user_contribution_score(self, user_id: str) -> Dict:
        """Summarise a user's contribution activity.

        Returns:
            Dict with the user's number of approved contributions
            ("contribution_count"), number of pending ones
            ("pending_count") and leaderboard rank. The rank is based on
            the number of submissions, approved or not.
        """
        approved = sum(1 for record in self.contribution_history
                       if record["user_id"] == user_id)
        pending = sum(1 for entry in self.pending_contributions.values()
                      if entry["user_id"] == user_id)
        return {
            "user_id": user_id,
            "contribution_count": approved,
            "pending_count": pending,
            "leaderboard_rank": self._calculate_leaderboard_rank(user_id)
        }

    def _calculate_leaderboard_rank(self, user_id: str) -> int:
        """Return the user's 1-based rank by number of submissions.

        The rank is one plus the number of users with strictly more
        submissions, so users with equal counts share a rank. A user who
        never submitted counts as 0 and ranks below every submitter.
        """
        own_count = self.user_contributions.get(user_id, 0)
        ahead = sum(1 for count in self.user_contributions.values()
                    if count > own_count)
        return ahead + 1
14.4 MCP服务器集成
class KnowledgeBaseMCPServer:
    """MCP server facade exposing the knowledge base as tools.

    Attributes:
        assistant: KnowledgeAssistant used for search and answers.
        recommendations: KnowledgeRecommendationEngine (stored; no tool
            in this class calls it).
        contributions: KnowledgeContributionSystem used for submissions.
    """

    def __init__(self, assistant: KnowledgeAssistant,
                 recommendation_engine: KnowledgeRecommendationEngine,
                 contribution_system: KnowledgeContributionSystem):
        self.assistant = assistant
        self.recommendations = recommendation_engine
        self.contributions = contribution_system

    def get_tools(self) -> List[Dict]:
        """Return the tool definitions (name, description, JSON input schema)."""
        return [
            {
                "name": "search_knowledge",
                "description": "搜索知识库",
                "inputSchema": {
                    "type": "object",
                    "properties": {
                        "query": {"type": "string"},
                        "limit": {"type": "integer"}
                    },
                    "required": ["query"]
                }
            },
            {
                "name": "get_answer",
                "description": "获取问题答案",
                "inputSchema": {
                    "type": "object",
                    "properties": {
                        "question": {"type": "string"}
                    },
                    "required": ["question"]
                }
            },
            {
                "name": "submit_contribution",
                "description": "提交知识贡献",
                "inputSchema": {
                    "type": "object",
                    "properties": {
                        "user_id": {"type": "string"},
                        "title": {"type": "string"},
                        "content": {"type": "string"},
                        "category": {"type": "string"},
                        "tags": {"type": "array", "items": {"type": "string"}}
                    },
                    "required": ["user_id", "title", "content", "category"]
                }
            }
        ]

    async def call_tool(self, tool_name: str, arguments: Dict) -> str:
        """Dispatch a tool call and return its result as a JSON string.

        Args:
            tool_name: One of the names returned by get_tools.
            arguments: Tool arguments as described by the tool's schema.

        Returns:
            JSON text of the tool result. Failures (unknown tool, missing
            required arguments, or an exception raised by the tool) are
            reported as a JSON object with a single "error" key instead
            of being raised.
        """
        import json

        spec = next((tool for tool in self.get_tools() if tool["name"] == tool_name), None)
        if spec is None:
            return json.dumps({"error": f"Unknown tool: {tool_name}"}, ensure_ascii=False)

        # Validate against the schema's required list up front; otherwise a
        # missing key surfaces as a bare KeyError whose text is just the
        # quoted key name.
        missing = [key for key in spec["inputSchema"]["required"] if key not in arguments]
        if missing:
            return json.dumps(
                {"error": "Missing required arguments: " + ", ".join(missing)},
                ensure_ascii=False
            )

        try:
            if tool_name == "search_knowledge":
                result = await self.assistant.search_engine.full_text_search(
                    arguments["query"],
                    limit=arguments.get("limit", 10)
                )
            elif tool_name == "get_answer":
                result = await self.assistant.answer_question(arguments["question"])
            elif tool_name == "submit_contribution":
                result = await self.contributions.submit_contribution(
                    arguments["user_id"],
                    arguments["content"],
                    arguments["title"],
                    arguments["category"],
                    arguments.get("tags", [])
                )
            else:
                # Declared in get_tools but not wired up here.
                return json.dumps({"error": f"Unknown tool: {tool_name}"}, ensure_ascii=False)
            return json.dumps(result, ensure_ascii=False)
        except Exception as e:
            # Tool boundary: report the failure to the client as data
            # rather than letting it propagate.
            return json.dumps({"error": str(e)}, ensure_ascii=False)
14.5 完整使用示例
async def main():
    """Walk through the knowledge-base components end to end.

    Seeds a graph with two sample items, runs a search, records two
    interactions, fetches recommendations, submits a contribution and
    prints that user's contribution score.
    """
    # Wire the components around one shared graph.
    graph = KnowledgeGraph()
    helper = KnowledgeAssistant(graph)
    recommender = KnowledgeRecommendationEngine(graph)
    contributions = KnowledgeContributionSystem(graph)

    # Seed data: (id, title, content, content type, tags).
    seed_entries = (
        ("kb_001", "如何使用MCP?", "MCP是模型上下文协议...",
         ContentType.WIKI, ["MCP", "教程"]),
        ("kb_002", "MCP最佳实践", "遵循这些最佳实践...",
         ContentType.BEST_PRACTICE, ["MCP", "实践"]),
    )
    seeded = [
        KnowledgeItem(
            id=entry_id,
            title=entry_title,
            content=entry_body,
            content_type=entry_kind,
            tags=entry_tags,
            created_at=datetime.now(),
            updated_at=datetime.now(),
            author="admin",
            category="技术"
        )
        for entry_id, entry_title, entry_body, entry_kind, entry_tags in seed_entries
    ]
    for entry in seeded:
        graph.add_item(entry)

    # Step 1: search.
    print("🔍 Searching knowledge...")
    hits = await helper.search_engine.full_text_search("MCP教程")
    print(f"Found {len(hits)} results")

    # Step 2: record interactions for one user on the first item.
    print("👤 Recording user interactions...")
    for action in ("view", "like"):
        recommender.record_interaction("user_001", "kb_001", action)

    # Step 3: recommendations for that user.
    print("💡 Getting recommendations...")
    suggested = await recommender.get_personalized_recommendations("user_001")
    print(f"Recommendations: {suggested}")

    # Step 4: a second user submits a contribution.
    print("📝 Submitting contribution...")
    receipt = await contributions.submit_contribution(
        "user_002",
        "这是新的MCP教程内容...",
        "高级MCP技巧",
        "技术",
        ["MCP", "高级"]
    )
    print(f"Contribution ID: {receipt['contribution_id']}")

    # Step 5: that user's contribution score.
    print("🏆 User contribution score...")
    summary = contributions.get_user_contribution_score("user_002")
    print(json.dumps(summary, indent=2))
14.6 知识库监控与统计
class KnowledgeBaseMetrics:
    """Usage and health metrics for a knowledge graph.

    Attributes:
        graph: The knowledge graph being measured.
        search_queries: One record per call to record_search.
        user_sessions: Per-user session data; initialised empty and not
            written by any method of this class.
    """

    def __init__(self, knowledge_graph: KnowledgeGraph):
        self.graph = knowledge_graph
        self.search_queries: List[Dict] = []
        self.user_sessions: Dict[str, Dict] = {}

    def record_search(self, query: str, results_count: int,
                      user_id: Optional[str] = None):
        """Record one executed search.

        Args:
            query: The query text.
            results_count: Number of results the search produced.
            user_id: ID of the searching user, if known.
        """
        self.search_queries.append({
            "query": query,
            "results_count": results_count,
            "user_id": user_id,
            "timestamp": datetime.now().isoformat()
        })

    def get_popular_searches(self, limit: int = 10) -> List[Dict]:
        """Return the most frequent query strings with their counts.

        Queries are compared exactly (case-sensitively).
        """
        from collections import Counter

        frequency = Counter(entry["query"] for entry in self.search_queries)
        return [
            {"query": text, "count": count}
            for text, count in frequency.most_common(limit)
        ]

    def get_knowledge_health(self) -> Dict:
        """Return the health indicators of the knowledge base.

        Returns:
            Dict with the graph's totals, the average view count per
            item, the most viewed and most helpful items, and the health
            score.
        """
        stats = self.graph.get_statistics()
        return {
            "total_items": stats["total_items"],
            "total_tags": stats["total_tags"],
            "categories": stats["total_categories"],
            "avg_views_per_item": self._average_views(stats),
            "most_popular_items": stats["most_viewed"],
            "most_helpful_items": stats["most_helpful"],
            "health_score": self._calculate_health_score(stats)
        }

    def _average_views(self, stats: Dict) -> float:
        """Return the mean view_count per item (0.0 for an empty graph)."""
        total_views = sum(item.view_count for item in self.graph.nodes.values())
        # max(..., 1) avoids dividing by zero when there are no items.
        return total_views / max(stats["total_items"], 1)

    def _calculate_health_score(self, stats: Dict) -> float:
        """Compute a health score from size, tag variety and usage.

        Points: 30 for more than 100 items (20 for more than 50), 20 for
        more than 50 tags, 25 for more than 10 average views per item.
        The result is capped at 100 and always returned as a float.
        """
        score = 0
        if stats["total_items"] > 100:
            score += 30
        elif stats["total_items"] > 50:
            score += 20
        if stats["total_tags"] > 50:
            score += 20
        if self._average_views(stats) > 10:
            score += 25
        return float(min(score, 100))
本章总结
| 关键点 | 说明 |
|---|---|
| 知识组织 | 知识图谱和标签系统 |
| 智能搜索 | 全文搜索和相关性排序 |
| 知识推荐 | 基于内容和标签的推荐 |
| 新员工培训 | 角色-课程匹配 |
| 论文管理 | 引用分析和相似度计算 |
| 影响力评估 | 引用计数和影响力等级 |
常见问题
Q1: 如何处理知识库中的过期信息? A: 标记更新日期,定期审查,添加过期标记或重定向到新版本。
Q2: 如何提高搜索准确度? A: 使用机器学习排序、用户反馈优化、知识标准化。
Q3: 如何管理知识库访问权限? A: 基于角色的访问控制(RBAC),分类权限管理。
Q4: 如何处理知识冲突? A: 版本控制、审批流程、冲突标记和解决机制。
Q5: 如何评估知识库价值? A: 使用频率、用户满意度、问题解决率等指标。
下一章预告:第15章将讲述监控、告警与运维MCP应用!