第12章 文件系统与文档管理MCP应用

30 阅读14分钟

第12章 文件系统与文档管理MCP应用

前言

文档是企业的知识资产。本章展示如何通过MCP构建智能文档管理系统,让LLM能够理解、搜索、分类和处理企业文档,以及辅助代码开发和审查。


12.1 案例1:智能文档管理系统

12.1.1 应用场景

graph TB
    A["文档管理需求"] --> B["文档类型多样"]
    A --> C["查询困难"]
    A --> D["分类混乱"]
    A --> E["版本管理复杂"]
    
    B --> B1["Word/PDF/Excel/图片"]
    C --> C1["全文搜索"]
    D --> D1["自动分类/标签"]
    E --> E1["版本追踪"]
    
    F["MCP解决方案"] --> F1["文档解析"]
    F --> F2["智能搜索"]
    F --> F3["自动分类"]
    F --> F4["版本管理"]
    F --> F5["权限控制"]

12.1.2 实现架构

import hashlib
import logging
import mimetypes
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

@dataclass
class Document:
    """Metadata and extracted text for one file.

    ``DocumentParser.parse_document`` fills every field except ``tags`` and
    ``category``; those stay ``None`` until a classifier result is assigned.
    """
    path: str                        # file path as a string
    name: str                        # final path component (file name)
    type: str                        # guessed MIME type, or "unknown"
    size: int                        # size in bytes (``st_size``)
    created_at: datetime             # derived from ``st_ctime``
    modified_at: datetime            # derived from ``st_mtime``
    content: Optional[str] = None    # extracted text; may be empty or None
    # ``None`` means "not classified yet"; the annotation now says so.
    tags: Optional[List[str]] = None
    category: Optional[str] = None   # category label chosen by the classifier
    hash: Optional[str] = None       # SHA-256 hex digest of the file bytes


class DocumentParser:
    """Turn a file on disk into a ``Document`` (metadata, text and SHA-256).

    Every method is static. The ``async`` methods perform ordinary blocking
    file I/O; they only await each other.
    """

    # The original code logged through an undefined ``logger`` name, which
    # turned every extraction failure into a NameError.
    _logger = logging.getLogger(__name__)

    # MIME type of Word .docx documents.
    _DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

    @staticmethod
    async def parse_document(file_path: str) -> "Document":
        """
        Parse one file into a ``Document``.

        Args:
            file_path: Path of the file to parse.

        Returns:
            A ``Document`` with metadata, extracted text and content hash.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        path = Path(file_path)

        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        stat = path.stat()
        mime_type, _ = mimetypes.guess_type(str(path))

        content = await DocumentParser._extract_content(file_path, mime_type)
        file_hash = DocumentParser._calculate_hash(file_path)

        return Document(
            path=str(path),
            name=path.name,
            type=mime_type or "unknown",
            size=stat.st_size,
            # NOTE(review): st_ctime is the creation time on Windows but the
            # metadata-change time on most Unix systems - confirm that is
            # acceptable for "created_at".
            created_at=datetime.fromtimestamp(stat.st_ctime),
            modified_at=datetime.fromtimestamp(stat.st_mtime),
            content=content,
            hash=file_hash
        )

    @staticmethod
    async def _extract_content(file_path: str, mime_type: Optional[str]) -> str:
        """
        Extract the text of a file according to its MIME type.

        Args:
            file_path: Path of the file to read.
            mime_type: Guessed MIME type; ``None`` when it could not be guessed.

        Returns:
            The extracted text, or ``""`` for unsupported or unknown types.

        Raises:
            UnicodeDecodeError: For ``text/plain`` files that are not valid
                UTF-8 (other ``text/*`` types drop undecodable bytes instead).
        """
        if mime_type == "application/pdf":
            return await DocumentParser._extract_pdf(file_path)
        if mime_type == DocumentParser._DOCX_MIME:
            return await DocumentParser._extract_docx(file_path)
        if mime_type == "text/plain":
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        if mime_type and mime_type.startswith("text"):
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                return f.read()
        return ""

    @staticmethod
    async def _extract_pdf(file_path: str) -> str:
        """Return the text of a PDF, or ``""`` if extraction fails.

        Failures (including PyPDF2 not being installed) are logged, not raised.
        """
        try:
            import PyPDF2
            with open(file_path, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                # ``or ""`` guards against a page that yields no text object.
                pages = [page.extract_text() or "" for page in reader.pages]
            return '\n'.join(pages)
        except Exception as e:  # best effort: one bad file must not abort indexing
            DocumentParser._logger.error("PDF extraction failed: %s", e)
            return ""

    @staticmethod
    async def _extract_docx(file_path: str) -> str:
        """Return the paragraph text of a DOCX file, or ``""`` if extraction fails.

        Failures (including python-docx not being installed) are logged, not raised.
        """
        try:
            from docx import Document as DocxDocument
            doc = DocxDocument(file_path)
            return '\n'.join(paragraph.text for paragraph in doc.paragraphs)
        except Exception as e:  # best effort: one bad file must not abort indexing
            DocumentParser._logger.error("DOCX extraction failed: %s", e)
            return ""

    @staticmethod
    def _calculate_hash(file_path: str) -> str:
        """Return the SHA-256 hex digest of the file, read in 8 KiB chunks."""
        hash_obj = hashlib.sha256()
        with open(file_path, 'rb') as f:
            for chunk in iter(lambda: f.read(8192), b''):
                hash_obj.update(chunk)
        return hash_obj.hexdigest()


class DocumentClassifier:
    """Keyword-based document classifier.

    Each category owns a keyword list. A document goes to the category with
    the most keywords found in its content plus file name.
    """

    def __init__(self):
        # category label -> keywords that vote for it
        self.categories = {
            "合同": ["合同", "协议", "agreement"],
            "报告": ["报告", "report", "analysis"],
            "规程": ["规程", "procedure", "guideline"],
            "法律": ["法律", "legal", "law"],
            "技术": ["技术", "技术规范", "technical", "api"]
        }

    def classify_document(self, document: "Document") -> Dict[str, object]:
        """
        Classify a document by keyword matching.

        Args:
            document: Object exposing ``content`` and ``name`` strings.

        Returns:
            A dict with ``category`` (label or ``"unknown"``), ``confidence``
            (matched share of the winning category's keywords, 0.0-1.0) and
            ``tags`` (every matched keyword, in category order, no duplicates).
        """
        unknown = {"category": "unknown", "confidence": 0.0, "tags": []}
        if not document.content:
            return unknown

        # Matching is a case-insensitive substring test.
        haystack = (document.content + document.name).lower()

        hits_by_category = {
            category: [kw for kw in keywords if kw.lower() in haystack]
            for category, keywords in self.categories.items()
        }

        # ``max`` keeps the first category on ties.
        best_category = max(
            hits_by_category,
            key=lambda category: len(hits_by_category[category]),
            default=None,
        )
        # Without this guard a document matching nothing was labelled with
        # the first category in the table.
        if best_category is None or not hits_by_category[best_category]:
            return unknown

        confidence = len(hits_by_category[best_category]) / len(self.categories[best_category])

        # dict.fromkeys de-duplicates while keeping a stable order.
        tags = list(dict.fromkeys(
            kw for hits in hits_by_category.values() for kw in hits
        ))

        return {
            "category": best_category,
            "confidence": min(confidence, 1.0),
            "tags": tags
        }


class DocumentSearchEngine:
    """In-memory full-text search over indexed documents.

    Keeps a simple inverted index (token -> document ids) plus the documents
    themselves, keyed by id.
    """

    def __init__(self):
        self.index: Dict[str, List[int]] = {}  # token -> ids of documents containing it
        self.documents: Dict[int, "Document"] = {}

    def index_document(self, doc_id: int, document: "Document"):
        """
        Add a document to the index.

        Args:
            doc_id: Identifier under which the document is stored.
            document: Object exposing a ``content`` string (may be empty).
        """
        self.documents[doc_id] = document

        if not document.content:
            return

        # Each distinct token records the document once per call; indexing
        # the same id again appends duplicates, which ``search`` de-duplicates.
        for token in set(self._tokenize(document.content)):
            self.index.setdefault(token, []).append(doc_id)

    def search(self, query: str, limit: int = 10) -> List[Dict]:
        """
        Search the indexed documents.

        Args:
            query: Whitespace-separated search terms.
            limit: Maximum number of results to return.

        Returns:
            Result dicts sorted by descending ``relevance_score``; each has
            ``document`` (path/name/type/size), ``relevance_score`` and
            ``matched_terms``.
        """
        query_words = self._tokenize(query)

        matching_docs = set()
        for word in query_words:
            matching_docs.update(self.index.get(word, ()))

        results = []
        for doc_id in matching_docs:
            doc = self.documents[doc_id]
            # Query tokens are lowercased, so the text must be too. Counting
            # against the raw content missed every differently-cased hit.
            text = (doc.content or "").lower()
            score = sum(text.count(word) for word in query_words)

            results.append({
                "document": {
                    "path": doc.path,
                    "name": doc.name,
                    "type": doc.type,
                    "size": doc.size
                },
                "relevance_score": score,
                "matched_terms": [w for w in query_words if w in text]
            })

        results.sort(key=lambda x: x['relevance_score'], reverse=True)
        return results[:limit]

    def _tokenize(self, text: str) -> List[str]:
        """Split on whitespace, lowercase, and drop tokens of 1-2 characters.

        This is deliberately naive: text without spaces (e.g. Chinese) becomes
        one token, so a real deployment needs a proper tokenizer.
        """
        return [word.lower() for word in text.split() if len(word) > 2]


class DocumentManagementMCP:
    """Document management service: parse, classify, index and look up files."""

    # The original code logged through an undefined ``logger`` name.
    _logger = logging.getLogger(__name__)

    def __init__(self, root_path: str):
        """Create the service rooted at ``root_path`` with empty indexes."""
        self.root = Path(root_path)
        self.parser = DocumentParser()
        self.classifier = DocumentClassifier()
        self.search_engine = DocumentSearchEngine()
        self.documents: Dict[str, "Document"] = {}
        # Path -> search-engine id. Kept separately so that indexing a path
        # again reuses its id instead of taking another document's id.
        self._doc_ids: Dict[str, int] = {}

    async def index_directory(self, path: Optional[str] = None) -> Dict:
        """
        Recursively index every file under a directory.

        Files that fail to parse are logged and counted, not raised.

        Args:
            path: Directory to scan; defaults to the service root.

        Returns:
            ``{"indexed": int, "errors": int, "total": int}`` for this run.
        """
        scan_path = Path(path) if path else self.root

        indexed_count = 0
        error_count = 0

        for file_path in scan_path.rglob("*"):
            if not file_path.is_file():
                continue
            try:
                doc = await self.parser.parse_document(str(file_path))
                classification = self.classifier.classify_document(doc)
                doc.category = classification["category"]
                doc.tags = classification["tags"]

                key = str(file_path)
                self.documents[key] = doc
                # ``len(self.documents)`` does not grow for an already-known
                # path, so it handed out colliding ids on re-index.
                doc_id = self._doc_ids.setdefault(key, len(self._doc_ids) + 1)
                self.search_engine.index_document(doc_id, doc)

                indexed_count += 1
            except Exception as e:  # best effort: keep indexing the other files
                self._logger.error("Failed to index %s: %s", file_path, e)
                error_count += 1

        return {
            "indexed": indexed_count,
            "errors": error_count,
            "total": indexed_count + error_count
        }

    async def search_documents(self, query: str, limit: int = 10) -> List[Dict]:
        """Run a full-text search; see ``DocumentSearchEngine.search``."""
        return self.search_engine.search(query, limit)

    async def get_document_info(self, file_path: str) -> Dict:
        """
        Return the stored metadata of an indexed document.

        Args:
            file_path: Path exactly as it was indexed.

        Returns:
            A JSON-friendly dict (timestamps as ISO strings, no content).

        Raises:
            FileNotFoundError: If ``file_path`` has not been indexed.
        """
        if file_path not in self.documents:
            raise FileNotFoundError(f"Document not found: {file_path}")

        doc = self.documents[file_path]
        return {
            "name": doc.name,
            "path": doc.path,
            "type": doc.type,
            "size": doc.size,
            "created_at": doc.created_at.isoformat(),
            "modified_at": doc.modified_at.isoformat(),
            "category": doc.category,
            "tags": doc.tags,
            "hash": doc.hash,
            "content_length": len(doc.content) if doc.content else 0
        }

12.2 案例2:代码仓库管理与开发助手

12.2.1 代码审查工具

class CodeReviewTool:
    """Heuristic, text-based code reviewer.

    Runs four independent checks (style, security, performance, complexity).
    All of them are simple string heuristics, not real parsers, so both false
    positives and false negatives are expected.
    """

    # Lines longer than this are reported by the style check.
    MAX_LINE_LENGTH = 100

    def __init__(self):
        # check name -> function(code, language) returning a list of findings
        self.rules = {
            "style": self._check_style,
            "security": self._check_security,
            "performance": self._check_performance,
            "complexity": self._check_complexity
        }

    async def review_code(self, code: str, language: str = "python") -> Dict:
        """
        Review a piece of source code.

        Args:
            code: Source text to review.
            language: Programming language of ``code``.

        Returns:
            A dict with ``language``, ``total_lines``, ``issues_count``,
            ``issues`` (most severe first) and ``summary`` (count per severity).
        """
        severity_rank = {"error": 3, "warning": 2, "info": 1}
        issues = []

        for check_type, check_func in self.rules.items():
            for finding in check_func(code, language):
                issues.append({
                    "type": check_type,
                    "severity": finding["severity"],
                    "line": finding.get("line", -1),  # -1: finding is not tied to a line
                    "message": finding["message"],
                    "suggestion": finding.get("suggestion", "")
                })

        # Stable sort: findings of equal severity keep their check order.
        issues.sort(key=lambda x: severity_rank.get(x["severity"], 0), reverse=True)

        return {
            "language": language,
            "total_lines": len(code.split('\n')),
            "issues_count": len(issues),
            "issues": issues,
            "summary": self._generate_summary(issues)
        }

    def _check_style(self, code: str, language: str) -> List[Dict]:
        """Report over-long lines and indentation that is not a multiple of 4.

        Only Python is checked; other languages yield no findings.
        """
        issues = []

        if language != "python":
            return issues

        for line_no, line in enumerate(code.split('\n'), 1):
            if len(line) > self.MAX_LINE_LENGTH:
                issues.append({
                    "line": line_no,
                    "severity": "warning",
                    "message": "Line too long ({}>{})".format(len(line), self.MAX_LINE_LENGTH),
                    "suggestion": "Break into multiple lines"
                })

            # The previous expression evaluated ``str // int`` and raised
            # TypeError on every space-indented line. Blank lines are skipped.
            # NOTE: continuation lines aligned to a bracket are flagged too.
            if line.startswith(' ') and line.strip():
                indent = len(line) - len(line.lstrip(' '))
                if indent % 4 != 0:
                    issues.append({
                        "line": line_no,
                        "severity": "warning",
                        "message": "Inconsistent indentation",
                        "suggestion": "Use 4 spaces per indentation level"
                    })

        return issues

    def _check_security(self, code: str, language: str) -> List[Dict]:
        """Flag possible hard-coded credentials and string-formatted SQL."""
        issues = []
        lowered = code.lower()

        # Crude signal: a credential-like word plus any quote character.
        if "password" in lowered or "secret" in lowered:
            if '"' in code or "'" in code:
                issues.append({
                    "severity": "error",
                    "message": "Possible hardcoded credentials detected",
                    "suggestion": "Use environment variables or secrets management"
                })

        # Crude signal: an ``execute`` call together with ``%`` anywhere.
        if "execute" in lowered and "%" in code:
            issues.append({
                "severity": "warning",
                "message": "Possible SQL injection vulnerability",
                "suggestion": "Use parameterized queries"
            })

        return issues

    def _check_performance(self, code: str, language: str) -> List[Dict]:
        """Flag code that contains both ``for`` and ``.append``."""
        issues = []

        if "for" in code and ".append" in code:
            issues.append({
                "severity": "info",
                "message": "List append in loop detected",
                "suggestion": "Consider using list comprehension for better performance"
            })

        return issues

    def _check_complexity(self, code: str, language: str) -> List[Dict]:
        """Estimate complexity from branch and loop keyword counts.

        The score is ``1 + if + elif + 0.5 * for + 0.5 * while`` over the
        whole text; a warning is reported when it exceeds 10.
        """
        issues = []

        def occurrences(keyword: str) -> int:
            # Count the keyword mid-line and at the start of a line.
            return code.count(f" {keyword} ") + code.count(f"\n{keyword} ")

        complexity = (
            1
            + occurrences("if")
            + occurrences("elif")
            + occurrences("for") * 0.5
            + occurrences("while") * 0.5
        )

        if complexity > 10:
            issues.append({
                "severity": "warning",
                "message": f"High cyclomatic complexity ({complexity:.1f})",
                "suggestion": "Consider breaking down the function into smaller parts"
            })

        return issues

    def _generate_summary(self, issues: List[Dict]) -> Dict:
        """Return the number of issues per severity level."""
        severity_count = {}
        for issue in issues:
            sev = issue["severity"]
            severity_count[sev] = severity_count.get(sev, 0) + 1

        return severity_count

12.3 MCP服务器集成与工具定义

class DocumentManagementMCPServer:
    """Expose the document manager and code reviewer as MCP tools."""

    def __init__(self, doc_manager: "DocumentManagementMCP"):
        """Wrap ``doc_manager`` and build the tool definitions."""
        self.doc_manager = doc_manager
        self.tools = self._define_tools()

    def _define_tools(self) -> List[Dict]:
        """Return the tool definitions (name, description, JSON input schema)."""
        return [
            {
                "name": "index_documents",
                "description": "索引指定目录下的所有文档",
                "inputSchema": {
                    "type": "object",
                    "properties": {
                        "directory": {
                            "type": "string",
                            "description": "要索引的目录路径"
                        }
                    },
                    "required": ["directory"]
                }
            },
            {
                "name": "search_documents",
                "description": "全文搜索文档",
                "inputSchema": {
                    "type": "object",
                    "properties": {
                        "query": {
                            "type": "string",
                            "description": "搜索查询"
                        },
                        "limit": {
                            "type": "integer",
                            "description": "返回结果数限制"
                        }
                    },
                    "required": ["query"]
                }
            },
            {
                "name": "get_document_info",
                "description": "获取文档详细信息",
                "inputSchema": {
                    "type": "object",
                    "properties": {
                        "file_path": {
                            "type": "string",
                            "description": "文件路径"
                        }
                    },
                    "required": ["file_path"]
                }
            },
            {
                "name": "review_code",
                "description": "代码审查",
                "inputSchema": {
                    "type": "object",
                    "properties": {
                        "code": {
                            "type": "string",
                            "description": "要审查的代码"
                        },
                        "language": {
                            "type": "string",
                            "description": "编程语言"
                        }
                    },
                    "required": ["code", "language"]
                }
            }
        ]

    @staticmethod
    def _require(arguments: Dict, key: str):
        """Return ``arguments[key]`` or raise a readable error if it is absent."""
        if key not in arguments:
            # A bare KeyError would reach the caller as the cryptic "'query'".
            raise ValueError(f"Missing required argument: {key}")
        return arguments[key]

    async def call_tool(self, tool_name: str, arguments: Dict) -> str:
        """
        Dispatch a tool call and return its result as a JSON string.

        Failures never raise: unknown tools, missing arguments and errors from
        the tool itself all come back as ``{"error": "..."}``.

        Args:
            tool_name: One of the names returned by ``_define_tools``.
            arguments: Tool arguments as described by the tool's input schema.

        Returns:
            JSON text of the tool result, or of an error object.
        """
        import json

        def error(message: str) -> str:
            # ensure_ascii=False matches the success path, so non-ASCII text
            # (e.g. Chinese file names) stays readable in error messages too.
            return json.dumps({"error": message}, ensure_ascii=False)

        try:
            if tool_name == "index_documents":
                result = await self.doc_manager.index_directory(arguments.get("directory"))
            elif tool_name == "search_documents":
                result = await self.doc_manager.search_documents(
                    self._require(arguments, "query"),
                    arguments.get("limit", 10)
                )
            elif tool_name == "get_document_info":
                result = await self.doc_manager.get_document_info(
                    self._require(arguments, "file_path")
                )
            elif tool_name == "review_code":
                code_review = CodeReviewTool()
                result = await code_review.review_code(
                    self._require(arguments, "code"),
                    arguments.get("language", "python")
                )
            else:
                return error(f"Unknown tool: {tool_name}")

            return json.dumps(result, ensure_ascii=False, indent=2)

        except Exception as e:  # tool boundary: report the failure, never raise
            return error(str(e))

12.4 工作流程示例

12.4.1 智能文档处理流程

class DocumentProcessingWorkflow:
    """Three-step pipeline: index a directory, analyse it, build a report."""

    def __init__(self, doc_manager: "DocumentManagementMCP", code_review: "CodeReviewTool"):
        """Store the collaborators; ``code_review`` is kept but unused here."""
        self.doc_manager = doc_manager
        self.code_review = code_review

    async def process_new_documents(self, directory: str) -> Dict:
        """
        Run the full workflow for a directory of documents.

        Args:
            directory: Directory to index.

        Returns:
            ``{"steps": [...], "summary": {...}}`` where each step has
            ``name``, ``status`` and ``result``.
        """
        workflow_result = {
            "steps": [],
            "summary": {}
        }

        def start_step(name: str) -> Dict:
            step = {"name": name, "status": "running"}
            workflow_result["steps"].append(step)
            return step

        def finish_step(step: Dict, result: Dict) -> None:
            step["status"] = "completed"
            step["result"] = result

        # Step 1: index the documents.
        step = start_step("Index Documents")
        index_result = await self.doc_manager.index_directory(directory)
        finish_step(step, index_result)

        # Step 2: analyse the category distribution. This reads everything
        # the search engine holds, not only the files indexed in step 1.
        step = start_step("Analyze Distribution")
        stats = self.doc_manager.search_engine.documents
        distribution = self._analyze_document_distribution(stats)
        finish_step(step, distribution)

        # Step 3: build the report.
        step = start_step("Generate Report")
        report = self._generate_processing_report(index_result, distribution)
        finish_step(step, report)

        workflow_result["summary"] = {
            "total_documents_processed": index_result["indexed"],
            "errors": index_result["errors"],
            # Placeholder values: nothing is measured or evaluated here.
            "processing_time": "computed",
            "recommendation": "Ready for Claude integration"
        }

        return workflow_result

    def _analyze_document_distribution(self, stats: Dict) -> Dict:
        """Count documents per category (``None`` counts as "unknown").

        Args:
            stats: Mapping whose values expose a ``category`` attribute.
        """
        categories = {}
        for doc in stats.values():
            cat = doc.category or "unknown"
            categories[cat] = categories.get(cat, 0) + 1

        return {
            "total": len(stats),
            "by_category": categories,
            # Fixed text; no balance check is actually performed.
            "distribution_summary": "Categories balanced"
        }

    def _generate_processing_report(self, index_result: Dict,
                                   distribution: Dict) -> Dict:
        """Build the processing report.

        Args:
            index_result: Dict with ``indexed`` and ``errors`` counts.
            distribution: Output of ``_analyze_document_distribution``.

        Returns:
            The report dict; ``success_rate`` is ``"N/A"`` when no file was
            attempted.
        """
        indexed = index_result["indexed"]
        errors = index_result["errors"]
        attempted = indexed + errors
        # An empty directory used to raise ZeroDivisionError here. "N/A"
        # mirrors how DocumentManagementMetrics reports an undefined rate.
        success_rate = f"{(indexed / attempted * 100):.1f}%" if attempted else "N/A"

        return {
            "title": "Document Processing Report",
            "date": datetime.now().isoformat(),
            "indexed_count": indexed,
            "error_count": errors,
            "success_rate": success_rate,
            "distribution": distribution,
            "next_steps": [
                "Start using search functionality",
                "Configure automatic classification",
                "Set up access permissions"
            ]
        }

12.5 部署架构与安全

graph TB
    A["用户请求"] --> B["Claude Desktop"]
    B --> C["MCP Client"]
    C --> D["文档管理服务器"]
    
    D --> E["文档解析器"]
    D --> F["分类器"]
    D --> G["搜索引擎"]
    
    E --> H["PDF处理"]
    E --> I["DOCX处理"]
    E --> J["文本处理"]
    
    D --> K["权限管理"]
    K --> L["角色检查"]
    K --> M["路径验证"]
    
    D --> N["缓存层"]
    N --> O["热点文档缓存"]
    N --> P["索引缓存"]

12.5.1 安全考虑

class DocumentAccessControl:
    """Role- and path-based access checks plus path sanitising."""

    def __init__(self):
        # role -> actions that role may perform
        self.roles = {
            "admin": ["read", "write", "delete"],
            "user": ["read"],
            "guest": ["read"]
        }
        # path prefix -> roles allowed under that prefix
        self.path_restrictions = {
            "/confidential/": ["admin"],
            "/public/": ["admin", "user", "guest"]
        }

    def check_access(self, user_role: str, file_path: str,
                    action: str) -> bool:
        """
        Decide whether a role may perform an action on a path.

        The path is compared as given (plain prefix match), so callers should
        pass an already normalised path. Paths matching no restricted prefix
        are governed by the role's actions alone.

        Args:
            user_role: Role name; unknown roles have no permissions.
            file_path: Path being accessed.
            action: Action name such as ``"read"``.

        Returns:
            True if access is allowed.
        """
        for restricted_path, allowed_roles in self.path_restrictions.items():
            if file_path.startswith(restricted_path) and user_role not in allowed_roles:
                return False

        return action in self.roles.get(user_role, [])

    def sanitize_path(self, base_path: str, requested_path: str) -> Optional[str]:
        """
        Resolve a requested path and reject anything outside the base directory.

        Args:
            base_path: Directory that requests must stay inside.
            requested_path: Path supplied by the caller, relative to the base.

        Returns:
            The resolved absolute path, or None if it escapes ``base_path``
            or cannot be resolved.
        """
        from pathlib import Path

        try:
            base = Path(base_path).resolve()
            requested = (base / requested_path).resolve()

            # Compare path components, not characters: a string-prefix test
            # accepts a sibling such as "/data/docs_secret" for "/data/docs".
            try:
                requested.relative_to(base)
            except ValueError:
                return None

            return str(requested)

        except Exception:  # fail closed: anything unexpected means "not allowed"
            return None

12.6 扩展功能

12.6.1 版本管理

class DocumentVersionManager:
    """In-memory version history per file path.

    Only version metadata (number, time, author, hash, message, size) is
    kept; the content itself is not stored.
    """

    def __init__(self):
        # file path -> version records, oldest first (version N is at index N-1)
        self.versions: Dict[str, List[Dict]] = {}

    async def create_version(self, file_path: str, content: str,
                           author: str, message: str = "") -> Dict:
        """
        Record a new version of a document.

        Args:
            file_path: Path identifying the document.
            content: Full text of this version.
            author: Who created the version.
            message: Optional change description.

        Returns:
            The new version record; ``size`` is the character count of
            ``content`` and ``hash`` its SHA-256 hex digest.
        """
        history = self.versions.setdefault(file_path, [])

        version = {
            "version": len(history) + 1,
            "timestamp": datetime.now().isoformat(),
            "author": author,
            "hash": hashlib.sha256(content.encode()).hexdigest(),
            "message": message,
            "size": len(content)
        }

        history.append(version)

        return version

    def get_version_history(self, file_path: str) -> List[Dict]:
        """Return the version records of a file, or ``[]`` if it has none."""
        return self.versions.get(file_path, [])

    def compare_versions(self, file_path: str, version1: int,
                        version2: int) -> Dict:
        """
        Compare the metadata of two versions of a file.

        Args:
            file_path: Path identifying the document.
            version1: 1-based number of the first version.
            version2: 1-based number of the second version.

        Returns:
            Both records plus the size change and whole-day time difference,
            or ``{"error": ...}`` if the file or a version is unknown.
        """
        if file_path not in self.versions:
            return {"error": "File not found"}

        versions = self.versions[file_path]

        # The lower bound matters: 0 or a negative number would otherwise
        # index from the end of the list and return the wrong version.
        if not (1 <= version1 <= len(versions) and 1 <= version2 <= len(versions)):
            return {"error": "Version not found"}

        v1 = versions[version1 - 1]
        v2 = versions[version2 - 1]
        elapsed = datetime.fromisoformat(v2['timestamp']) - datetime.fromisoformat(v1['timestamp'])

        return {
            "file": file_path,
            "version1": v1,
            "version2": v2,
            "differences": {
                "size_change": v2["size"] - v1["size"],
                "time_difference": f"{elapsed.days} days"
            }
        }

12.7 性能优化与监控

class DocumentManagementMetrics:
    """Collects timing samples and cache counters for the document system."""

    def __init__(self):
        self.search_times: List[float] = []   # one entry per recorded search, in ms
        self.index_times: List[float] = []    # one entry per recorded index run, in ms
        self.cache_hits = 0
        self.cache_misses = 0

    def record_search_time(self, elapsed_ms: float):
        """Store the duration of one search."""
        self.search_times.append(elapsed_ms)

    def record_index_time(self, elapsed_ms: float):
        """Store the duration of one indexing operation."""
        self.index_times.append(elapsed_ms)

    @staticmethod
    def _timing_summary(samples: List[float], count_label: str) -> Dict:
        """Average, maximum and count of ``samples`` (zeros when empty)."""
        if samples:
            mean = sum(samples) / len(samples)
            peak = max(samples)
        else:
            mean = peak = 0
        return {
            "avg_time_ms": mean,
            "max_time_ms": peak,
            count_label: len(samples)
        }

    def get_statistics(self) -> Dict:
        """Return search, index and cache statistics as a nested dict.

        The cache ``hit_rate`` is a percentage string, or ``"N/A"`` when no
        cache lookup has been counted.
        """
        lookups = self.cache_hits + self.cache_misses
        if lookups > 0:
            hit_rate = f"{(self.cache_hits / lookups * 100):.1f}%"
        else:
            hit_rate = "N/A"

        return {
            "search": self._timing_summary(self.search_times, "queries"),
            "index": self._timing_summary(self.index_times, "operations"),
            "cache": {
                "hits": self.cache_hits,
                "misses": self.cache_misses,
                "hit_rate": hit_rate
            }
        }

12.8 完整使用示例

async def main():
    """End-to-end demo: index, search, review code, version, print metrics."""
    # ``json`` is used for the final print but was never imported at module
    # level, so the demo ended with a NameError.
    import json

    documents_root = "/data/documents"

    # Set up the system.
    doc_manager = DocumentManagementMCP(documents_root)
    code_review = CodeReviewTool()
    server = DocumentManagementMCPServer(doc_manager)  # built to show the wiring; not called below
    workflow = DocumentProcessingWorkflow(doc_manager, code_review)
    version_mgr = DocumentVersionManager()
    metrics = DocumentManagementMetrics()

    # 1. Process the documents.
    print("🔍 Processing documents...")
    result = await workflow.process_new_documents(documents_root)
    print(f"✅ Indexed: {result['summary']['total_documents_processed']} documents")

    # 2. Search the documents.
    print("🔎 Searching for documents...")
    search_results = await doc_manager.search_documents("quarterly report")
    print(f"Found {len(search_results)} results")

    # 3. Review a code sample.
    print("📝 Reviewing code...")
    test_code = '''
def calculate_total(items):
    total = 0
    for item in items:
        total = total + item["price"] * item["quantity"]
    return total
    '''
    review = await code_review.review_code(test_code, "python")
    print(f"Found {review['issues_count']} issues")

    # 4. Record a document version.
    print("📦 Creating version...")
    version = await version_mgr.create_version(
        "/data/documents/report.md",
        "Updated content",
        "admin",
        "Fixed typos"
    )
    print(f"Version {version['version']} created")

    # 5. Print the metrics. Nothing above records into ``metrics``, so the
    # output shows the empty defaults.
    print("\n📊 System Metrics:")
    stats = metrics.get_statistics()
    print(json.dumps(stats, indent=2))

本章总结

| 关键点 | 说明 |
| --- | --- |
| 文档解析 | 支持多种文件格式 |
| 智能分类 | 关键字匹配和分类 |
| 全文搜索 | 倒排索引实现 |
| 代码审查 | 多维度代码分析 |
| 版本管理 | 文件哈希和追踪 |
| 权限控制 | 基于路径的访问控制 |

常见问题

Q1: 如何支持更多文件格式? A: 为每种格式实现对应的解析方法(参照 _extract_pdf、_extract_docx),并在 DocumentParser._extract_content 中按MIME类型增加相应的分支即可。

Q2: 搜索性能如何优化? A: 使用ElasticSearch等专业搜索引擎,或实施分布式索引。

Q3: 如何处理大文件? A: 使用流式读取,分块处理,避免一次性加载全部内容。

Q4: 代码审查支持哪些语言? A: 本章示例中的风格检查仅针对Python,其余检查只是简单的文本匹配;通过为不同语言实现相应的检查规则,可以扩展到其他编程语言。

Q5: 如何确保文档隐私? A: 实施细粒度的权限控制,加密敏感文档内容。


下一章预告:第13章将讲述API与外部服务集成MCP应用


第13章 API与外部服务集成MCP应用

前言

API是连接内外部系统的桥梁。本章展示如何通过MCP为LLM提供与第三方服务(如电商平台、CRM、支付等)集成的能力。


13.1 案例1:电商平台订单管理系统

13.1.1 应用场景

graph TB
    A["电商订单管理"] --> B["订单查询"]
    A --> C["库存管理"]
    A --> D["支付处理"]
    A --> E["售后服务"]
    
    F["Claude"] --> F1["智能查询"]
    F --> F2["库存提示"]
    F --> F3["支付建议"]
    F --> F4["退货处理"]
    
    F1 --> B
    F2 --> C
    F3 --> D
    F4 --> E

13.1.2 电商API集成

from typing import Dict, List, Optional
from dataclasses import dataclass
from datetime import datetime
import aiohttp

@dataclass
class Order:
    """One order as returned by the e-commerce API's order listing."""
    order_id: str          # the API's "id" field
    customer_id: str
    status: str
    total_amount: float    # the API's "total" field; NOTE(review): currency/units not shown here - confirm
    items: List[Dict]      # line items exactly as returned by the API
    created_at: datetime   # parsed from an ISO-8601 string
    updated_at: datetime   # parsed from an ISO-8601 string


class EcommerceAPIClient:
    """Async client for the e-commerce REST API (bearer-token authentication).

    Call ``connect()`` before any request and ``disconnect()`` when done.
    HTTP error statuses are not raised; response bodies are returned as-is.
    """

    def __init__(self, api_key: str, base_url: str):
        """Store the credentials; no network connection is opened yet."""
        self.api_key = api_key
        self.base_url = base_url
        self.session = None

    async def connect(self):
        """Open the HTTP session if none is currently open.

        Calling this twice no longer replaces (and leaks) a live session.
        """
        if self.session is None or self.session.closed:
            self.session = aiohttp.ClientSession()

    async def disconnect(self):
        """Close the HTTP session, if any, and forget it."""
        if self.session:
            await self.session.close()
            # Drop the closed session so later calls fail with a clear error.
            self.session = None

    def _require_session(self):
        """Return the open session or raise if ``connect()`` was not called."""
        if self.session is None:
            raise RuntimeError("EcommerceAPIClient is not connected; call connect() first")
        return self.session

    def _auth_headers(self) -> Dict:
        """Return the Authorization header sent with every request."""
        return {"Authorization": f"Bearer {self.api_key}"}

    def _url(self, *segments: str) -> str:
        """Join path segments onto the base URL.

        Each segment is percent-encoded so an ID containing ``/`` or ``..``
        cannot change which endpoint is addressed; a trailing slash on the
        base URL is tolerated.
        """
        from urllib.parse import quote

        encoded = [quote(str(segment), safe="") for segment in segments]
        return "/".join([self.base_url.rstrip("/"), *encoded])

    async def search_orders(self, customer_id: str = None,
                           status: str = None, limit: int = 10) -> List["Order"]:
        """
        Search orders.

        Args:
            customer_id: Only return orders of this customer, if given.
            status: Only return orders in this status, if given.
            limit: Maximum number of orders to request.

        Returns:
            The orders found under the response's ``orders`` key.

        Raises:
            RuntimeError: If the client is not connected.
        """
        params = {"limit": limit}
        if customer_id:
            params["customer_id"] = customer_id
        if status:
            params["status"] = status

        async with self._require_session().get(
            self._url("orders"),
            params=params,
            headers=self._auth_headers()
        ) as resp:
            data = await resp.json()

            return [
                Order(
                    order_id=item["id"],
                    customer_id=item["customer_id"],
                    status=item["status"],
                    total_amount=item["total"],
                    items=item["items"],
                    created_at=datetime.fromisoformat(item["created_at"]),
                    updated_at=datetime.fromisoformat(item["updated_at"])
                )
                for item in data.get("orders", [])
            ]

    async def get_order_details(self, order_id: str) -> Dict:
        """Fetch one order and return the decoded JSON response.

        Raises:
            RuntimeError: If the client is not connected.
        """
        async with self._require_session().get(
            self._url("orders", order_id),
            headers=self._auth_headers()
        ) as resp:
            return await resp.json()

    async def update_order_status(self, order_id: str, status: str) -> bool:
        """Set an order's status; True only if the API answered HTTP 200.

        Raises:
            RuntimeError: If the client is not connected.
        """
        async with self._require_session().patch(
            self._url("orders", order_id),
            json={"status": status},
            headers=self._auth_headers()
        ) as resp:
            return resp.status == 200

    async def check_inventory(self, product_id: str) -> Dict:
        """
        Look up the inventory of a product.

        Args:
            product_id: Product identifier.

        Returns:
            The decoded JSON response.

        Raises:
            RuntimeError: If the client is not connected.
        """
        async with self._require_session().get(
            self._url("inventory", product_id),
            headers=self._auth_headers()
        ) as resp:
            return await resp.json()

    async def process_refund(self, order_id: str, reason: str) -> Dict:
        """
        Request a refund for an order.

        Args:
            order_id: Order to refund.
            reason: Reason given for the refund.

        Returns:
            The decoded JSON response.

        Raises:
            RuntimeError: If the client is not connected.
        """
        async with self._require_session().post(
            self._url("refunds"),
            json={"order_id": order_id, "reason": reason},
            headers=self._auth_headers()
        ) as resp:
            return await resp.json()


class OrderManagementMCP:
    """Order-management service layer exposed through MCP.

    Wraps an e-commerce API client and reshapes its raw responses into
    compact, JSON-friendly dictionaries.
    """

    # Maximum number of orders requested from the API per customer search.
    ORDER_SEARCH_LIMIT = 20
    # Number of leading search results echoed back as "recent_orders".
    RECENT_ORDER_COUNT = 5

    def __init__(self, api_client: "EcommerceAPIClient"):
        self.api = api_client

    async def find_order(self, order_id: str) -> Dict:
        """Look up a single order by ID.

        Args:
            order_id: Identifier of the order to fetch.

        Returns:
            ``{"found": bool, "order": dict | None}``. ``order`` is populated
            exactly when ``found`` is true, i.e. when the API payload is a
            JSON object carrying a non-null ``id``.
        """
        details = await self.api.get_order_details(order_id)

        # A null or non-object body is treated like a missing order. The same
        # "id is not None" test drives both keys so they can never disagree
        # (e.g. for a legitimate id of 0).
        if not isinstance(details, dict) or details.get("id") is None:
            return {"found": False, "order": None}

        exposed_fields = (
            "id", "status", "total", "items",
            "customer", "created_at", "updated_at",
        )
        return {
            "found": True,
            "order": {name: details.get(name) for name in exposed_fields},
        }

    async def search_customer_orders(self, customer_id: str) -> Dict:
        """Summarise the orders of one customer.

        Args:
            customer_id: Identifier of the customer whose orders are listed.

        Returns:
            A dict with the customer ID, the number of orders returned by the
            API, the orders grouped by status, and the first
            ``RECENT_ORDER_COUNT`` orders in the order the API returned them.
        """
        orders = await self.api.search_orders(
            customer_id=customer_id, limit=self.ORDER_SEARCH_LIMIT
        )

        # Group a short summary of every order under its status.
        by_status = {}
        for order in orders:
            by_status.setdefault(order.status, []).append({
                "id": order.order_id,
                "total": order.total_amount,
                "items_count": len(order.items),
            })

        recent_orders = [
            {
                "id": order.order_id,
                "total": order.total_amount,
                "status": order.status,
                "date": order.created_at.isoformat(),
            }
            for order in orders[:self.RECENT_ORDER_COUNT]
        ]

        return {
            "customer_id": customer_id,
            "total_orders": len(orders),
            "by_status": by_status,
            "recent_orders": recent_orders,
        }

    async def handle_refund_request(self, order_id: str, reason: str) -> Dict:
        """Forward a refund request and return the relevant result fields.

        Args:
            order_id: Identifier of the order to refund.
            reason: Justification passed on to the API.

        Returns:
            A dict holding ``success``, ``refund_id``, ``amount``, ``status``
            and ``message`` from the API response; absent keys map to None.
        """
        result = await self.api.process_refund(order_id, reason)

        exposed_fields = ("success", "refund_id", "amount", "status", "message")
        return {name: result.get(name) for name in exposed_fields}

    async def check_product_availability(self, product_id: str) -> Dict:
        """Report whether a product is currently in stock.

        Args:
            product_id: Identifier of the product to check.

        Returns:
            A dict with the product ID, an ``available`` flag, the reported
            quantity, warehouse locations and next restock date.
        """
        inventory = await self.api.check_inventory(product_id)
        quantity = inventory.get("quantity")

        return {
            "product_id": product_id,
            # A missing or null quantity counts as out of stock instead of
            # failing on a None-to-int comparison.
            "available": (quantity or 0) > 0,
            "quantity": quantity,
            "warehouse_locations": inventory.get("locations"),
            "next_restock_date": inventory.get("next_restock"),
        }

13.2 工具定义与集成

class EcommerceMCPServer:
    """MCP server facade that exposes order-management operations as tools."""

    def __init__(self, order_manager: "OrderManagementMCP"):
        self.orders = order_manager

    def get_tools(self) -> List[Dict]:
        """Return the tool definitions (name, description, JSON input schema)."""
        return [
            {
                "name": "search_orders",
                "description": "Search for customer orders",
                "inputSchema": {
                    "type": "object",
                    "properties": {
                        "customer_id": {
                            "type": "string",
                            "description": "Customer ID to search orders for"
                        },
                        "status": {
                            "type": "string",
                            "description": "Order status filter (pending/processing/shipped/delivered)",
                            "enum": ["pending", "processing", "shipped", "delivered"]
                        }
                    },
                    "required": ["customer_id"]
                }
            },
            {
                "name": "get_order_details",
                "description": "Get detailed information about an order",
                "inputSchema": {
                    "type": "object",
                    "properties": {
                        "order_id": {
                            "type": "string",
                            "description": "Order ID"
                        }
                    },
                    "required": ["order_id"]
                }
            },
            {
                "name": "process_refund",
                "description": "Process a refund request",
                "inputSchema": {
                    "type": "object",
                    "properties": {
                        "order_id": {
                            "type": "string",
                            "description": "Order ID"
                        },
                        "reason": {
                            "type": "string",
                            "description": "Refund reason"
                        }
                    },
                    "required": ["order_id", "reason"]
                }
            }
        ]

    @staticmethod
    def _filter_by_status(summary: Dict, status: str) -> Dict:
        """Narrow a customer-order summary to the orders with one status.

        ``recent_orders`` is filtered from the already-truncated recent list,
        so it may hold fewer entries than ``by_status`` does.
        """
        matching = summary.get("by_status", {}).get(status, [])
        filtered = dict(summary)
        filtered["total_orders"] = len(matching)
        filtered["by_status"] = {status: matching} if matching else {}
        filtered["recent_orders"] = [
            entry for entry in summary.get("recent_orders", [])
            if entry.get("status") == status
        ]
        return filtered

    async def call_tool(self, tool_name: str, arguments: Dict) -> str:
        """Execute a tool by name and return its result as a JSON string.

        Args:
            tool_name: One of the names advertised by ``get_tools``.
            arguments: Tool arguments; must contain every name listed under
                ``required`` in the tool's input schema.

        Returns:
            The JSON-encoded tool result, or a JSON object with a single
            ``error`` key when the tool is unknown, a required argument is
            missing, or the underlying call raises. This method never raises.
        """
        import json

        def to_json(payload: Dict) -> str:
            # Keep non-ASCII text readable on success and error paths alike.
            return json.dumps(payload, ensure_ascii=False)

        # This is the tool boundary: every failure is reported to the caller
        # as an error payload rather than propagated.
        try:
            arguments = arguments or {}

            schema = next(
                (
                    tool["inputSchema"]
                    for tool in self.get_tools()
                    if tool["name"] == tool_name
                ),
                None,
            )
            if schema is None:
                return to_json({"error": f"Unknown tool: {tool_name}"})

            # Validate against the advertised schema up front so the caller
            # gets an actionable message instead of a bare key name.
            missing = [
                name for name in schema.get("required", [])
                if name not in arguments
            ]
            if missing:
                return to_json({
                    "error": f"Missing required argument(s): {', '.join(missing)}"
                })

            if tool_name == "search_orders":
                result = await self.orders.search_customer_orders(
                    arguments["customer_id"]
                )
                # Honour the optional status filter advertised in the schema.
                status = arguments.get("status")
                if status is not None:
                    result = self._filter_by_status(result, status)
            elif tool_name == "get_order_details":
                result = await self.orders.find_order(arguments["order_id"])
            elif tool_name == "process_refund":
                result = await self.orders.handle_refund_request(
                    arguments["order_id"],
                    arguments["reason"]
                )
            else:
                # Advertised by get_tools() but no handler is wired up here.
                return to_json({"error": f"Unknown tool: {tool_name}"})

            return to_json(result)

        except Exception as exc:
            return to_json({"error": str(exc)})

本章总结

| 关键点 | 说明 |
| --- | --- |
| API集成 | 通过HTTP/REST访问第三方服务 |
| 异步处理 | aiohttp进行高效的异步API调用 |
| 错误处理 | 完善的异常处理和重试机制 |
| 数据转换 | 将API响应转换为结构化数据 |
| 权限管理 | API密钥和认证 |
| 缓存策略 | 热点数据缓存优化性能 |

常见问题

Q1: 如何处理API限流? A: 实施令牌桶算法或指数退避重试,遵守API速率限制。

Q2: 如何保护API密钥? A: 使用环境变量、密钥管理服务,不要硬编码密钥。

Q3: 如何处理API超时? A: 设置合理的超时时间,实施自动重试和降级方案。

Q4: 如何支持多个API版本? A: 通过适配器模式或版本路由支持多个API版本。

Q5: 如何监控API集成健康状态? A: 定期health check,记录API调用指标,设置告警。


下一章预告:第14章将讲述知识库与信息管理MCP应用