5天从零搭建RAG系统

阅读5分钟

一、安装milvus

pip install milvus pymilvus

用镜像源下载快很多

# 清华源
pip install milvus pymilvus -i https://pypi.tuna.tsinghua.edu.cn/simple

# 阿里云
pip install milvus pymilvus -i https://mirrors.aliyun.com/pypi/simple/

# 豆瓣
pip install milvus pymilvus -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com

测试连接脚本(保存为test_milvus.py)

# Smoke test for embedded Milvus Lite: start the server, connect,
# create a minimal collection, then shut the server down.
from milvus import default_server
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection

# Start the local embedded server (Milvus Lite).
default_server.start()

# Connect to it; listen_port is assigned by the embedded server.
connections.connect(host='127.0.0.1', port=default_server.listen_port)

# Create a test collection: auto-id primary key + 128-dim float vector.
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=128)
]
schema = CollectionSchema(fields, "test")
collection = Collection("test_collection", schema)

print("Milvus Lite 跑通!Collection创建成功")

# Stop the embedded server.
default_server.stop()

执行:

python test_milvus.py

二、接入真实embedding模型

一、安装embedding依赖

pip install sentence-transformers -i https://pypi.tuna.tsinghua.edu.cn/simple

二、测试脚本(保存为test_embedding.py)

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # hide all GPUs so inference runs on CPU

from sentence_transformers import SentenceTransformer

# Load the Chinese BGE embedding model on CPU (slow but dependable).
embedder = SentenceTransformer('BAAI/bge-large-zh', device='cpu')

# Encode one sample sentence and inspect the resulting vector.
sample = "测试文本"
embedding = embedder.encode(sample)
print(f"向量维度: {len(embedding)}")
print(f"前5个值: {embedding[:5]}")
print("CPU模式跑通!")

三、把embedding和Milvus连起来——文本→向量→存入Milvus→搜索返回

一、测试脚本(保存为day3_rag_pipeline.py,不要沿用第二天的test_embedding.py,以免覆盖)

# Day 3 pipeline: text -> embedding -> insert into Milvus -> vector search.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # hide GPUs so encoding runs on CPU

from milvus import default_server
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection
from sentence_transformers import SentenceTransformer

# 1. Start embedded Milvus Lite and connect to it.
default_server.start()
connections.connect(host='127.0.0.1', port=default_server.listen_port)

# 2. Create the collection (dim=1024 matches the BGE model's vectors).
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=500),
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=1024)
]
schema = CollectionSchema(fields, "doc_collection")
collection = Collection("docs", schema)

# 3. Load the embedding model on CPU.
model = SentenceTransformer('BAAI/bge-large-zh', device='cpu')

# 4. Insert sample data. Column order matches the non-auto-id fields:
#    text first, then vector.
texts = ["这是第一段测试文本", "这是第二段关于财务的内容"]
vectors = [model.encode(t) for t in texts]
entities = [texts, vectors]
collection.insert(entities)
collection.flush()
print(f"插入 {len(texts)} 条数据")

# 5. Build a FLAT (brute-force) index and load the collection for search.
index_params = {"metric_type": "L2", "index_type": "FLAT", "params": {}}
collection.create_index("vector", index_params)
collection.load()

# 6. Search test. No output_fields given, so hits expose only id/distance.
query = "财务相关内容"
query_vec = model.encode(query)
results = collection.search([query_vec], "vector", param={"metric_type": "L2"}, limit=2)

print(f"查询: {query}")
for hits in results:
    for hit in hits:
        print(f"  找到: id={hit.id}, distance={hit.distance}")

# 7. Stop the embedded server (data is not persisted across restarts).
default_server.stop()
print("RAG流程跑通!")

四、真实文档流程

创建 docs 文件夹,放入任意文本: mkdir -p ~/milvus/docs

创建3个测试文档(复制粘贴任意内容)

echo "财务报销流程:员工提交发票→部门审批→财务审核→打款。注意事项:发票抬头需为公司全称,金额超过1000元需附明细清单。" > ~/milvus/docs/报销.txt

echo "2024年税收政策:小微企业增值税起征点提高至月销售额10万元,所得税减免延续至2027年底。申报方式:电子税务局在线提交。" > ~/milvus/docs/税务.txt

echo "固定资产管理办法:单价超过5000元的设备需登记台账,每年盘点一次,报废需经三层审批。" > ~/milvus/docs/资产.txt

day4_real_docs.py

# Day 4: ingest real text files, chunk them, embed, index, and search.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # force CPU for the embedding model

from milvus import default_server
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection
from sentence_transformers import SentenceTransformer
import glob

# 1. Start the embedded Milvus Lite server and connect to it.
default_server.start()
connections.connect(host='127.0.0.1', port=default_server.listen_port)

# 2. Create the collection. dim=1024 matches BAAI/bge-large-zh output.
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="filename", dtype=DataType.VARCHAR, max_length=100),
    FieldSchema(name="chunk", dtype=DataType.VARCHAR, max_length=1000),
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=1024)
]
schema = CollectionSchema(fields, "doc_chunks")
collection = Collection("real_docs", schema)

# 3. Load the embedding model (CPU: slow but works everywhere).
model = SentenceTransformer('BAAI/bge-large-zh', device='cpu')

# 4. Read documents and split them into fixed-size chunks.
docs_path = "/home/ubuntu/milvus/docs/*.txt"
all_chunks = []

for filepath in glob.glob(docs_path):
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
        # Naive chunking: 100 characters per chunk.
        for i in range(0, len(content), 100):
            chunk = content[i:i+100]
            if len(chunk) > 20:  # drop tails too short to be useful
                all_chunks.append({
                    'filename': os.path.basename(filepath),
                    'text': chunk
                })

print(f"共切分 {len(all_chunks)} 段")

# 5. Embed and insert in small batches (CPU encoding is the bottleneck).
batch_size = 4
for i in range(0, len(all_chunks), batch_size):
    batch = all_chunks[i:i+batch_size]
    texts = [item['text'] for item in batch]
    filenames = [item['filename'] for item in batch]
    vectors = [model.encode(t) for t in texts]

    # Column order must match the non-auto-id fields: filename, chunk, vector.
    entities = [filenames, texts, vectors]
    collection.insert(entities)
    print(f"插入 {i+len(batch)}/{len(all_chunks)}")

collection.flush()
print(f"总计插入 {len(all_chunks)} 段")

# 6. Build a FLAT index and load the collection into memory for search.
index_params = {"metric_type": "L2", "index_type": "FLAT", "params": {}}
collection.create_index("vector", index_params)
collection.load()

# 7. Search test. output_fields is required so hit.entity carries the
#    stored scalar fields (without it, hits only expose id/distance).
query = "怎么报销发票"
query_vec = model.encode(query)
results = collection.search([query_vec], "vector", param={"metric_type": "L2"}, limit=3, output_fields=["filename", "chunk"])

print(f"\n查询: {query}")
for hits in results:
    for hit in hits:
        filename = hit.entity.get('filename') if hit.entity else "unknown"
        chunk = hit.entity.get('chunk') if hit.entity else "unknown"
        # Bug fix: the original printed the literal "(unknown)" and never
        # used the filename it had just extracted.
        print(f"  来自 {filename}: {chunk[:50] if chunk else 'N/A'}... (距离: {hit.distance:.3f})")

default_server.stop()
print("\nDay 4 完成!真实文档流程跑通")

image.png

五、接入DeepSeek API,实现"问问题→检索→LLM生成回答"

export DEEPSEEK_API_KEY="sk-你的key"

day5_rag_final.py

# Day 5: end-to-end RAG — question -> vector retrieval -> DeepSeek answer.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # force CPU for the embedding model

from milvus import default_server
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
from sentence_transformers import SentenceTransformer
from openai import OpenAI
import glob

# 1. Start embedded Milvus Lite and connect.
default_server.start()
connections.connect(host='127.0.0.1', port=default_server.listen_port)

# 2. Connect to DeepSeek (OpenAI-compatible endpoint).
#    Robustness fix: fail fast with a clear message when the key is missing,
#    instead of a confusing auth error on the first chat call.
api_key = os.getenv("DEEPSEEK_API_KEY")
if not api_key:
    raise SystemExit('DEEPSEEK_API_KEY is not set; run: export DEEPSEEK_API_KEY="sk-..."')
client = OpenAI(
    api_key=api_key,
    base_url="https://api.deepseek.com"
)

# 3. Load the embedding model on CPU.
model = SentenceTransformer('BAAI/bge-large-zh', device='cpu')

# 4. Reuse the collection if it exists, otherwise create and populate it.
#    NOTE: Milvus Lite here is in-memory, so after default_server.stop()
#    the data is gone and this branch re-ingests the documents.
collection_name = "real_docs_v2"
if utility.has_collection(collection_name):
    collection = Collection(collection_name)
    print(f"复用已有collection: {collection_name}")
else:
    # Schema: auto-id key, source filename, text chunk, 1024-dim vector
    # (1024 matches BAAI/bge-large-zh output).
    fields = [
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="filename", dtype=DataType.VARCHAR, max_length=100),
        FieldSchema(name="chunk", dtype=DataType.VARCHAR, max_length=1000),
        FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=1024)
    ]
    schema = CollectionSchema(fields, "doc_chunks")
    collection = Collection(collection_name, schema)

    # Re-ingest: read every .txt, cut into 100-char chunks, drop tiny tails.
    docs_path = "/home/ubuntu/milvus/docs/*.txt"
    all_chunks = []
    for filepath in glob.glob(docs_path):
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
            for i in range(0, len(content), 100):
                chunk = content[i:i+100]
                if len(chunk) > 20:
                    all_chunks.append({
                        'filename': os.path.basename(filepath),
                        'text': chunk
                    })

    print(f"重新插入 {len(all_chunks)} 段数据...")
    batch_size = 4  # small batches: CPU encoding is the bottleneck
    for i in range(0, len(all_chunks), batch_size):
        batch = all_chunks[i:i+batch_size]
        texts = [item['text'] for item in batch]
        filenames = [item['filename'] for item in batch]
        vectors = [model.encode(t) for t in texts]
        # Column order must match the non-auto-id fields: filename, chunk, vector.
        entities = [filenames, texts, vectors]
        collection.insert(entities)

    collection.flush()
    index_params = {"metric_type": "L2", "index_type": "FLAT", "params": {}}
    collection.create_index("vector", index_params)
    print(f"插入完成,共 {len(all_chunks)} 段")

collection.load()

# 5. RAG query function.
def rag_query(user_question):
    """Answer *user_question* from the indexed documents.

    Embeds the question, retrieves the top-3 chunks from Milvus, stuffs
    them into a prompt, and asks DeepSeek for an answer.
    Returns (answer_text, list_of_context_chunks).
    """
    query_vec = model.encode(user_question)
    # output_fields is required so hit.entity carries the stored 'chunk' text.
    results = collection.search([query_vec], "vector", param={"metric_type": "L2"}, limit=3, output_fields=["filename", "chunk"])

    contexts = []
    for hits in results:
        for hit in hits:
            chunk = hit.entity.get('chunk') if hit.entity else ""
            contexts.append(chunk)

    context = "\n".join(contexts)

    prompt = f"""基于以下参考信息回答问题:
{context}

问题:{user_question}
回答:"""

    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": prompt}],
        stream=False
    )

    return response.choices[0].message.content, contexts

# 6. End-to-end smoke test.
question = "怎么报销发票?"
print(f"问题:{question}")
answer, sources = rag_query(question)

print(f"\n参考片段:")
for i, ctx in enumerate(sources, 1):
    print(f"{i}. {ctx[:80]}...")

print(f"\n生成回答:\n{answer}")

default_server.stop()
print("\nDay 5 完成!端到端RAG跑通")

image.png

踩过的坑

image.png hit.entity是None,说明搜索结果没有带回存储的字段。根本原因是search时没有传output_fields参数——不指定时命中结果只包含id和distance,不会回填标量字段;加上output_fields=["filename", "chunk"]后即可正常读取。

image.png Milvus Lite是内存存储,上次default_server.stop()后数据清空。需要重新插入数据,或者改用持久化存储。