9.langchain 入门到放弃(四) Text embedding models

　　langchain 入门到放弃(四) Text embedding models

　　Embeddings 类专门用于与文本嵌入模型进行交互。嵌入模型的提供商有很多（OpenAI、Cohere、Hugging Face 等），而该类的目的就是为所有这些提供商提供一套统一的标准接口。

　　嵌入会为一段文本创建向量表示。这很有用，因为这意味着我们可以在向量空间中处理文本，并执行语义搜索之类的操作，即在向量空间中查找最相似的文本片段。

　　LangChain 中的 Embeddings 基类提供了两个方法：一个用于嵌入文档（embed_documents），另一个用于嵌入查询（embed_query）。前者接收多个文本作为输入，而后者只接收单个文本。之所以拆分为两个独立的方法，是因为某些嵌入模型提供商对文档（被搜索的内容）和查询（搜索语句本身）采用不同的嵌入方式。

from langchain_community.document_loaders import DirectoryLoader, CSVLoader, TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_text_splitters import CharacterTextSplitter


def load_file(file_path):
    """Return a document loader chosen by the extension of *file_path*.

    Paths ending in ``.csv`` get a ``CSVLoader`` and paths ending in
    ``.txt`` get a ``TextLoader``; the suffix match is case-sensitive.

    Raises:
        ValueError: If *file_path* ends in neither ``.csv`` nor ``.txt``.
    """
    is_csv = file_path.endswith('.csv')
    if is_csv:
        return CSVLoader(file_path)

    is_text = file_path.endswith('.txt')
    if is_text:
        return TextLoader(file_path)

    # Any other extension has no loader registered here.
    raise ValueError(f"Unsupported file type: {file_path}")
      
# Load the files under ../source; load_file is passed as the loader factory
# so the loader type is chosen per file from its extension.
directory = DirectoryLoader(path="../source", loader_cls=load_file)
docs = directory.load()

# Split the loaded documents with a target chunk size of 1000 and no overlap.
txt_spilt = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs_spilt = txt_spilt.split_documents(docs)

# Initialise the HuggingFace embeddings object from a local (offline) model,
# running on the CPU.
embeddings = HuggingFaceEmbeddings(
    model_name="../localLLM/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'},
)

# Embed the chunks and store them in the "my_chroma" collection of a Chroma
# vector database located in the local ../vector_store directory.
embeddingDB = Chroma.from_documents(
    documents=docs_spilt,
    embedding=embeddings,
    collection_name="my_chroma",
    persist_directory="../vector_store",
)

# Make sure the data is persisted.
embeddingDB.persist()