大模型对接知识库:通常我们对接知识库较常用的方式是在 Cherry-Studio 或 Chatbox 中设置知识库,这些工具降低了使用门槛。但在项目应用中,我们通常希望能够集中提供知识库,并统一提供调用接口。以下代码的运行环境为 ollama + deepseek + milvus;ollama 直接下载安装工具即可。环境准备与验证:1、执行 ollama run deepseek-r1:1.5b,确认模型可以正常运行;2、通过 attu 确认可以正常访问 milvus。
以下代码并未导入实际的知识库数据,因此知识库是空库,具体的导入代码请大家自行补充。
import requests
import json
from pymilvus import connections, CollectionSchema, FieldSchema, DataType, Collection
from pymilvus import utility # Import the utility module
# 环境准备部分的函数:获取 Ollama 嵌入向量
def get_embedding(text, model="deepseek-r1:1.5b",
                  url="http://localhost:11434/api/embeddings", timeout=60):
    """Return the embedding vector for ``text`` from an Ollama server.

    Args:
        text: Text to embed; sent as the ``prompt`` field of the request.
        model: Name of the Ollama model asked to produce the embedding.
        url: Full URL of the Ollama embeddings endpoint.
        timeout: Seconds to wait for the server before ``requests`` gives
            up; without it a stalled server would block the caller forever.

    Returns:
        The value of the ``embedding`` field of the JSON response.

    Raises:
        Exception: If the server answers with a non-200 status, or the
            response carries no (or an empty) ``embedding`` field.
        requests.exceptions.RequestException: On connection errors or
            timeouts (propagated from ``requests.post``).
    """
    payload = {"model": model, "prompt": text}
    response = requests.post(url, json=payload, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to get embedding: {response.status_code}")

    embedding = response.json().get("embedding")
    if not embedding:
        # Fail here with a clear message instead of letting an empty vector
        # reach the vector store, where the error would be harder to trace.
        raise Exception(
            "Failed to get embedding: response has no 'embedding' field"
        )
    return embedding
# 环境准备部分的函数:获取或创建 Milvus 集合
def get_or_create_milvus_collection():
    """Connect to Milvus and return the loaded question/answer collection.

    Opens the ``default`` connection to 127.0.0.1:19530 if it does not exist
    yet. When the collection ``question_answer_csv1`` is missing it is
    created with a VARCHAR primary key ``id`` and a 1536-dimensional float
    vector ``vec``, and an IVF_FLAT/L2 index is built on ``vec``; otherwise
    the existing collection is opened and a notice is printed.

    Returns:
        The ``Collection`` object after ``load()`` has been called on it.
    """
    alias = "default"
    if not connections.has_connection(alias):
        connections.connect(alias, host="127.0.0.1", port="19530")

    name = 'question_answer_csv1'
    if utility.has_collection(name):
        coll = Collection(name=name)
        print("Collection already exists.========================")
    else:
        id_field = FieldSchema(
            name='id',
            dtype=DataType.VARCHAR,
            description='问题的 id',
            max_length=500,
            is_primary=True,
            auto_id=False,
        )
        vec_field = FieldSchema(
            name='vec',
            dtype=DataType.FLOAT_VECTOR,
            description='问题的嵌入向量',
            dim=1536,
        )
        coll = Collection(
            name=name,
            schema=CollectionSchema(
                fields=[id_field, vec_field],
                description='问答搜索集合',
            ),
        )
        coll.create_index(
            field_name='vec',
            index_params={
                'metric_type': 'L2',
                'index_type': 'IVF_FLAT',
                'params': {'nlist': 2048},
            },
        )

    # NOTE(review): the original indentation was lost; load() is assumed to
    # run for both the freshly created and the pre-existing collection.
    coll.load()
    return coll
# 环境准备部分的函数:插入数据到 Milvus
def insert_data(collection, id, text, embedding):
    """Insert one ``(id, embedding)`` row into ``collection`` and print the result.

    Args:
        collection: Object whose ``insert`` method receives the column data.
        id: Row identifier; converted with ``str()`` before insertion.
        text: Accepted but not stored by this function; only the id and the
            embedding are inserted.
        embedding: Vector stored alongside the id.
    """
    # Column-oriented payload: one list per field, in (id, vector) order.
    columns = [[str(id)], [embedding]]
    mutation_result = collection.insert(columns)
    print(f"Inserted data: {mutation_result}")
# 环境准备部分的函数:搜索相似文本
def search_similar_texts(collection, query_text, top_k=5):
    """Search ``collection`` for the vectors closest to ``query_text``.

    The query text is embedded with ``get_embedding`` and searched against
    the ``vec`` field using the L2 metric with ``nprobe`` set to 10.

    Args:
        collection: Object whose ``search`` method runs the query.
        query_text: Text to embed and search for.
        top_k: Maximum number of hits requested (passed as ``limit``).

    Returns:
        Whatever ``collection.search`` returns.
    """
    search_kwargs = dict(
        data=[get_embedding(query_text)],
        # Must match the vector field name declared in the schema.
        anns_field="vec",
        param={"metric_type": "L2", "params": {"nprobe": 10}},
        limit=top_k,
        # The schema has no "text" field, so no extra fields are requested.
        output_fields=[],
    )
    return collection.search(**search_kwargs)
# 整合到问答系统的函数
def _join_response_fragments(body):
    """Concatenate the ``response`` fields of a newline-delimited JSON body.

    Lines that are blank, are not valid JSON, or lack a ``response`` field
    are skipped. Returns ``None`` when no line carries a ``response`` field.
    """
    fragments = []
    for line in body.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue
        if isinstance(record, dict) and 'response' in record:
            fragments.append(str(record['response']))
    return "".join(fragments) if fragments else None


def get_answer_from_ollama(query_text, context_texts, timeout=300):
    """Ask the Ollama ``/api/generate`` endpoint to answer ``query_text``.

    Args:
        query_text: The user's question.
        context_texts: Iterable of strings joined with spaces and placed in
            the prompt as context.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The complete generated answer as a string, or ``None`` (after
        printing diagnostics) when the response body contains no JSON object
        with a ``response`` field.

    Raises:
        Exception: If the server answers with a non-200 status.
        requests.exceptions.RequestException: On connection errors or
            timeouts (propagated from ``requests.post``).
    """
    context = " ".join(context_texts)
    url = "http://localhost:11434/api/generate"
    data = {
        "model": "deepseek-r1:1.5b",  # replace with the model you actually use
        "prompt": f"上下文信息: {context}。问题: {query_text}",
        # Ask for a single JSON object; otherwise the endpoint streams one
        # JSON line per fragment and each line holds only part of the answer.
        "stream": False,
    }
    response = requests.post(url, json=data, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to get answer: {response.status_code}")

    # errors='replace' keeps the diagnostics below printable even if the
    # body is not valid UTF-8.
    response_content = response.content.decode('utf-8', errors='replace')

    # Joining every fragment also copes with a server that streamed anyway;
    # returning on the first line would yield only the first piece.
    answer = _join_response_fragments(response_content)
    if answer is None:
        print("Error parsing JSON: No valid JSON data with 'response' field found.")
        print(f"Response content: {response_content}")
    return answer
# 主函数,整合整个流程
def main():
    """Run the demo pipeline: index sample texts, retrieve, then ask the model.

    Steps:
        1. Get or create the Milvus collection.
        2. Embed and insert the sample texts, remembering which id maps to
           which text (the collection itself stores only ids and vectors).
        3. Search the collection for the query.
        4. Turn the hits back into texts and pass them to the model as
           context, then print the question and the answer.
    """
    # 1. Get or create the Milvus collection.
    collection = get_or_create_milvus_collection()

    # 2. Sample data. insert_data stores str(i) as the primary key, so the
    #    same string is used as the lookup key here.
    texts = ["这是一段示例文本 1", "这是一段示例文本 2"]
    id_to_text = {}
    for i, text in enumerate(texts):
        insert_data(collection, i, text, get_embedding(text))
        id_to_text[str(i)] = text

    # 3. Query the knowledge base.
    query = "示例查询文本"
    similar_texts = search_similar_texts(collection, query)

    # 4. Map hit ids back to their texts. Ids that are unknown to this run
    #    are skipped, and duplicates (e.g. rows left over from earlier runs
    #    with the same ids) are collapsed while preserving hit order.
    matched = [
        id_to_text[str(hit.id)]
        for hits in similar_texts
        for hit in hits
        if str(hit.id) in id_to_text
    ]
    context_texts = list(dict.fromkeys(matched))

    answer = get_answer_from_ollama(query, context_texts)
    print(f"问题: {query}")
    print(f"答案: {answer}")
# Run the demo pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()