qdrant是一个开源向量数据库,安装方法有多种,具体可参考官方文档(qdrant.tech)。
各种embedding都试了一下,个人感觉汉语匹配准确度都比较一般
前置准备
pip install qdrant-client
# 本地模型选装
pip install numpy==1.24.4
pip install torch==1.13.0
pip install transformers==4.39.0
Qdrant本地无模型用法
这种方式不能指定向量维度,采用内置FastEmbed生成词向量
from qdrant_client import QdrantClient
# Start the vector database service entirely in memory (no persistence).
client = QdrantClient(":memory:") # or QdrantClient(path="path/to/db")
# Prepare your documents, metadata, and IDs
docs = ["C罗早已习惯将葡萄牙队的命运扛在自己肩上。", "福州地铁将免费乘车?不实"]
metadata = [
{"source": "Langchain-docs"},
{"source": "Linkedin-docs"},
]
ids = [42, 2]
# Use the new add method
# (FastEmbed generates the vectors automatically; no model setup needed)
client.add(
collection_name="demo_collection",
documents=docs,
metadata=metadata,
ids=ids
)
# Full-text query; the query string is embedded the same way as the documents.
search_result = client.query(
collection_name="demo_collection",
query_text="C罗最近怎样呢"
)
print(search_result)
Qdrant本地大模型(huggingface)用法,无需GPU
按照本地方式启动Qdrant,启动路径是../local_qdrant2
from qdrant_client import QdrantClient
# Persist the vector database on local disk under ./local_qdrant2.
client=QdrantClient(path="local_qdrant2")
安装pytorch以及transformers
pip install numpy==1.24.4
pip install torch==1.13.0
pip install transformers==4.39.0
生成embedding的方法,此处需要将huggingface的模型下载到本地,并通过huggingface提供的包transformers进行词向量生成
本例采用的是hfl/chinese-macbert-large模型(1024维),模型大概1G多
模型下载地址:huggingface.co/hfl/chinese…
下载pytorch模型,并放入model_name指定的目录下(eg:C:\model\chinese-macbert-large)
from transformers import BertModel, BertTokenizer
import torch

# Load the local model and tokenizer.
# Raw string avoids backslash-escape problems in the Windows path
# ("\m" / "\c" are invalid escape sequences and warn on Python 3.12+).
model_name = r"C:\model\chinese-macbert-large"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def generate_embedding(text):
    """Return a 1-D numpy vector (1024 floats for chinese-macbert-large) for *text*.

    Uses the [CLS] token's last-layer hidden state as the sentence embedding.
    """
    # Encode the text into model input tensors.
    inputs = tokenizer(text, return_tensors="pt")
    # Inference only — no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)
    # Last-layer hidden states, shape (1, seq_len, hidden_size).
    last_hidden_state = outputs.last_hidden_state
    # Take the [CLS] token's vector as the sentence-level embedding.
    cls_vector = last_hidden_state[:, 0, :]
    return cls_vector.numpy().flatten()

# Example text
text = "这是一个示例文本,用于生成词向量。"
# Generate the embedding and inspect it.
embedding = generate_embedding(text)
print(embedding.shape)
print(embedding)
创建向量数据库表,注意此处代码中的size需要和词向量生成的维度一致(1024)
from qdrant_client.models import Distance, VectorParams
# Create the collection; `size` must equal the embedding dimension
# (1024 for chinese-macbert-large), with cosine distance for similarity.
client.create_collection(
collection_name="example_collection7",
vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
)
插入数据
from qdrant_client.models import PointStruct
# Insert six demo points; each vector is produced by the local model and the
# original text is kept in the payload so search hits can show it back.
operation_info = client.upsert(
collection_name="example_collection7",
wait=True,
points=[
PointStruct(id=1, vector=generate_embedding("中共中央政治局第十六次集体学习"), payload={"text": "中共中央政治局第十六次集体学习"}),
PointStruct(id=2, vector=generate_embedding("王楚钦回应爆冷出局"), payload={"text": "王楚钦回应爆冷出局"}),
PointStruct(id=3, vector=generate_embedding("王楚钦爆冷出局"), payload={"text": "王楚钦爆冷出局"}),
PointStruct(id=4, vector=generate_embedding("樊振东vs黄镇廷"), payload={"text": "樊振东vs黄镇廷"}),
PointStruct(id=5, vector=generate_embedding("全红婵陈芋汐金牌"), payload={"text": "全红婵陈芋汐金牌"}),
PointStruct(id=6, vector=generate_embedding("张雨绮都有俩孩子了"), payload={"text": "张雨绮都有俩孩子了"})
],
)
print(operation_info)
检索向量数据库
# Embed the query text and return the 2 nearest points by cosine similarity.
search_result = client.search(
collection_name="example_collection7", query_vector=generate_embedding("张雨绮"), limit=2
)
print(search_result)
Qdrant Embedding在线服务(openAI + 字节跳动豆包模型)用法
这种方式只要是符合openAI协议的都可以用,我这里采用的是字节跳动的embedding模型在线服务
详见:console.volcengine.com/ark/region:…
需要自己申请apikey,并且按照token付费(同gpt4)
from qdrant_client import QdrantClient
# Reuse the on-disk local Qdrant storage (same path as the previous example).
client=QdrantClient(path="local_qdrant2")
通过openAI SDK生成embedding,api_key和model需要替换成你自己的
import os
from openai import OpenAI

def generate_embedding(text):
    """Return the embedding vector for *text* from an OpenAI-compatible API.

    Replace api_key and model below with your own credentials / endpoint id;
    any service that speaks the OpenAI protocol works the same way.
    """
    # gets API Key from environment variable OPENAI_API_KEY
    client = OpenAI(
        api_key="your key",  # os.environ.get("ARK_API_KEY"),
        base_url="https://ark.cn-beijing.volces.com/api/v3",
    )
    print("----- embeddings request -----")
    resp = client.embeddings.create(
        model="your model id",
        input=[text],
        encoding_format="float"
    )
    # A single input text was sent, so the first item holds its embedding.
    return resp.data[0].embedding

# Example text
text = "这是一个示例文本,用于生成词向量。"
# Generate the embedding vector.
embedding = generate_embedding(text)
创建表并插入数据,注意维度设置
from qdrant_client.models import Distance, VectorParams

# Collection name and embedding dimensionality — `size` must match
# what the online Doubao embedding model returns (2560 floats).
collection = "example_collection8"
weidu = 2560  # vector dimension

client.create_collection(
    collection_name=collection,
    vectors_config=VectorParams(size=weidu, distance=Distance.COSINE),
)
from qdrant_client.models import PointStruct
# Insert six demo points; vectors come from the online embedding service and
# the source text is stored as payload for display alongside search hits.
operation_info = client.upsert(
collection_name=collection,
wait=True,
points=[
PointStruct(id=1, vector=generate_embedding("中共中央政治局第十六次集体学习"), payload={"text": "中共中央政治局第十六次集体学习"}),
PointStruct(id=2, vector=generate_embedding("王楚钦回应爆冷出局"), payload={"text": "王楚钦回应爆冷出局"}),
PointStruct(id=3, vector=generate_embedding("王楚钦爆冷出局"), payload={"text": "王楚钦爆冷出局"}),
PointStruct(id=4, vector=generate_embedding("樊振东vs黄镇廷"), payload={"text": "樊振东vs黄镇廷"}),
PointStruct(id=5, vector=generate_embedding("全红婵陈芋汐金牌"), payload={"text": "全红婵陈芋汐金牌"}),
PointStruct(id=6, vector=generate_embedding("张雨绮都有俩孩子了"), payload={"text": "张雨绮都有俩孩子了"})
],
)
print(operation_info)
检索内容
# Embed the query via the online service and fetch the top 5 nearest points.
search_result = client.search(
collection_name=collection, query_vector=generate_embedding("体育"), limit=5
)
print(search_result)
Qdrant cloud用法
登录cloud.qdrant.io/,并注册一个免费的cloud空间(默认4G硬盘 1G内存 0.5核CPU)
同时也会生成一个Qdrant服务连接和密钥(请记牢,无法再次查看)
url、api_key换成自己的
示例采用内置的embedding生成
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

# Connect to the managed Qdrant Cloud instance; replace url/api_key with your own.
qdrant_client = QdrantClient(
    url="your urls",
    api_key="your key",
)

# Same pattern as the local no-model example: FastEmbed embeds the documents.
# Keep one collection-name variable and use it consistently (the original set
# collection_name="my_collection" but then hard-coded "my_collection2").
collection_name = "my_collection2"

docs = ["中共中央政治局第十六次集体学习", "王楚钦回应爆冷出局", "王楚钦爆冷出局", "樊振东vs黄镇廷", "全红婵陈芋汐金牌", "张雨绮都有俩孩子了"]
metadata = [
    {"source": "weibo-docs"},
    {"source": "weibo-docs"},
    {"source": "weibo-docs"},
    {"source": "weibo-docs"},
    {"source": "weibo-docs"},
    {"source": "weibo-docs"},
]
ids = [1, 2, 3, 4, 5, 6]

# Use the new add method; pass metadata so every point carries its source
# payload (the original built the list but never supplied it to add()).
qdrant_client.add(
    collection_name=collection_name,
    documents=docs,
    metadata=metadata,
    ids=ids
)

search_result = qdrant_client.query(
    collection_name=collection_name,
    query_text="政治"
)
print(search_result)