1. First, persist the txt document data to a local vector store
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
# Load the txt files from the folder
loader = DirectoryLoader('D:/data', glob='**/*.txt')
# Convert the data into Document objects; each file becomes one Document
documents = loader.load()
# Initialize the text splitter
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
# Split the loaded documents into chunks
split_docs = text_splitter.split_documents(documents)
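# Note: OpenAIEmbeddings reads the key from the OPENAI_API_KEY environment
# variable, so it must be set before this point (same as in step 2 below;
# the value here is a placeholder)
import os
os.environ["OPENAI_API_KEY"] = "sk-..."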
embeddings = OpenAIEmbeddings()
# Compute an embedding vector for each document chunk via the OpenAI embeddings API
# and persist them in the Chroma vector store for later retrieval
docsearch = Chroma.from_documents(split_docs, embeddings, persist_directory="D:/vector_store")
docsearch.persist()
If a module is missing during execution, just pip install each one (typically langchain, openai, chromadb, tiktoken, and unstructured, since DirectoryLoader parses files with unstructured by default).
After a successful run, you will see data generated in the vector_store folder on drive D.
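To double-check that the store was written correctly, one option is to reload it and run a raw similarity search, bypassing the LLM entirely (a quick sanity-check sketch; the query string is a placeholder):
# Reload the persisted store and query it directly
store = Chroma(persist_directory="D:/vector_store", embedding_function=embeddings)
print(store.similarity_search("test query", k=2))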
2. Build the Q&A object and ask questions
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain import OpenAI
from langchain.chains import RetrievalQA
import openai
import os
os.environ["OPENAI_API_KEY"] = "sk-kIsxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
openai.api_key = os.getenv("OPENAI_API_KEY")
embeddings = OpenAIEmbeddings()
docsearch = Chroma(persist_directory="D:/vector_store", embedding_function=embeddings)
# Create the Q&A chain; chain_type="stuff" stuffs all retrieved chunks into a single prompt
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=docsearch.as_retriever(), return_source_documents=True)
# Ask questions
def doc_txt(search_query):
    result = qa({"query": search_query})
    return result['result']
prompts = [
    # 'Does completing real-name registration mean verification succeeded?',
    'Who is the current CEO of Alibaba?',
    'Who is Jack Ma?',
    'Who is the President of the United States?',
]
for prompt in prompts:
    print('Q: ' + prompt + '\nA: ' + doc_txt(prompt) + '\n')
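Because return_source_documents=True was passed above, the dict returned by qa() also carries the retrieved chunks under the 'source_documents' key; a minimal sketch for inspecting them (the query string is just an example):
result = qa({"query": "Who is Jack Ma?"})
print(result['result'])
# Each entry is a langchain Document: metadata holds the source file path,
# page_content holds the chunk text
for doc in result['source_documents']:
    print(doc.metadata, doc.page_content[:80])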
3. Output results