Chroma 中自定义 Embeddings 的几种方法

1,313 阅读1分钟

1. 使用 LangChain(需要较新的版本)
以下参数请根据实际情况调整,本文示例使用的是 Azure OpenAI 服务。

**

import os

# Azure OpenAI connection settings, exported as environment variables.
# The key and the "xxx" endpoint are placeholders: replace them with real values.
os.environ.update({
    "OPENAI_API_KEY": "you key",
    "OPENAI_API_BASE": "https://xxx.openai.azure.com/",
    "OPENAI_API_TYPE": 'azure',
    "OPENAI_API_VERSION": "2023-03-15-preview",
})

import chromadb
from langchain.embeddings import OpenAIEmbeddings
from chromadb.api.types import Documents, EmbeddingFunction, Embeddings

# Create a Chroma client with the default settings (no arguments are passed).
chroma_client = chromadb.Client()
# LangChain embeddings client bound to the "xxx-embedding" deployment
# (placeholder name; use your own deployment name).
AzureEmbeddings = OpenAIEmbeddings(deployment="xxx-embedding")
class AzureEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that delegates to the module-level ``AzureEmbeddings`` client."""

    def __call__(self, texts: Documents) -> Embeddings:
        """Embed every text in ``texts`` via ``AzureEmbeddings.embed_query``.

        Args:
            texts: The documents to embed.

        Returns:
            One embedding per input text, in input order.
        """
        vectors = []
        for text in texts:
            # One embed_query call is issued per document.
            vectors.append(AzureEmbeddings.embed_query(text))
        return vectors

# Create the collection, passing the custom embedding function defined above.
collection = chroma_client.create_collection(name="my_collection", embedding_function=AzureEmbeddingFunction())
# Print the returned collection object.
print(collection)

2. 使用 openai 库直接调用 Azure OpenAI 服务

**

import os

# Export the Azure OpenAI settings as environment variables.
# The key and the "xxx" endpoint are placeholders: replace them with real values.
os.environ.update(
    OPENAI_API_KEY="you key",
    OPENAI_API_BASE="https://xxx.openai.azure.com/",
    OPENAI_API_TYPE='azure',
    OPENAI_API_VERSION="2023-03-15-preview",
)

import openai
import chromadb
from chromadb.api.types import Documents, EmbeddingFunction, Embeddings

# Create a Chroma client with the default settings (no arguments are passed).
chroma_client = chromadb.Client()

def embed_query(content):
    """Request an embedding for ``content`` from the ``xxx-embedding`` deployment.

    Args:
        content: Value forwarded as ``input`` to ``openai.Embedding.create``.

    Returns:
        The unmodified response of ``openai.Embedding.create``.
    """
    return openai.Embedding.create(engine="xxx-embedding", input=content)
class AzureEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function built on the module-level ``embed_query`` helper."""

    def __call__(self, texts: Documents) -> Embeddings:
        """Embed every text in ``texts`` with one ``embed_query`` call each.

        Args:
            texts: The documents to embed.

        Returns:
            One embedding per input text, in input order.
        """
        vectors = []
        for text in texts:
            response = embed_query(text)
            # The vector is read from the first entry of the response's 'data' list.
            vectors.append(response['data'][0]['embedding'])
        return vectors

# Create the collection, passing the custom embedding function defined above.
collection = chroma_client.create_collection(name="my_collection", embedding_function=AzureEmbeddingFunction())
# Print the returned collection object.
print(collection)

3. 使用本地模型生成 Embeddings
text2vec-base-chinese 模型需自行从 huggingface.co/shibing624/text2vec-base-chinese 下载到本地

**

import chromadb
from chromadb.api.types import Documents, EmbeddingFunction, Embeddings

from sentence_transformers import SentenceTransformer

# Create a Chroma client with the default settings (no arguments are passed).
chroma_client = chromadb.Client()

# Local directory of the text2vec-base-chinese model; adjust to your machine.
model_path = r'D:\PycharmProjects\example\models\text2vec-base-chinese'

# Load the model once at module level so every embedding call reuses it.
model = SentenceTransformer(model_name_or_path=model_path)

class MyEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function backed by the module-level SentenceTransformer ``model``."""

    def __call__(self, texts: Documents) -> Embeddings:
        """Embed all ``texts`` in a single batched ``model.encode`` call.

        Args:
            texts: The documents to embed.

        Returns:
            One embedding per input text, in input order, as plain Python
            lists of floats.
        """
        # Encode the whole batch at once instead of one call per text, and
        # convert the resulting NumPy array to nested lists so the return
        # value matches the declared ``Embeddings`` type (Chroma versions that
        # validate embeddings as lists reject raw ndarrays).
        return model.encode(list(texts), convert_to_numpy=True).tolist()

# Create the collection, passing the custom embedding function defined above.
collection = chroma_client.create_collection(name="my_collection", embedding_function=MyEmbeddingFunction())
# Print the returned collection object.
print(collection)

4. 使用 Chroma 内置的 SentenceTransformerEmbeddingFunction 直接加载本地模型

**

import chromadb
from chromadb.utils import embedding_functions

# Local directory of the text2vec-base-chinese model; adjust to your machine.
model_path = r'D:\PycharmProjects\example\models\text2vec-base-chinese'

# Create a Chroma client with the default settings (no arguments are passed).
chroma_client = chromadb.Client()

# Use the embedding function shipped in chromadb.utils instead of a custom class.
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=model_path,
)

collection = chroma_client.create_collection(
    name="my_collection",
    embedding_function=sentence_transformer_ef,
)
print(collection)