# Introduction
In today's data-driven world, efficient data storage and retrieval are essential for building intelligent applications. Yellowbrick, a high-performance SQL database that can double as a vector store, can be paired with ChatGPT through Retrieval Augmented Generation (RAG) to build a much smarter chatbot. This tutorial walks you through the process step by step.
# Main Content

## Initial Setup

### Requirements

- A Yellowbrick account
- An OpenAI API key (made available to your environment; see the sketch after the install step below)

### Installing Dependencies
```bash
%pip install --upgrade --quiet langchain langchain-openai langchain-community psycopg2-binary tiktoken
```
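
Before running any of the code below, make your OpenAI API key available to LangChain. A minimal sketch; replace the placeholder with your own key (or export `OPENAI_API_KEY` in your shell instead):

```python
import os

# Placeholder value -- substitute your real OpenAI API key
os.environ["OPENAI_API_KEY"] = "sk-..."
```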
## Part 1: Creating a Basic ChatGPT Chatbot

```python
from langchain.chains import LLMChain
from langchain_core.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain_openai import ChatOpenAI

# Without retrieval, the model can only guess at Yellowbrick-specific questions
system_template = """If you don't know the answer, make up your best guess."""
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]
prompt = ChatPromptTemplate.from_messages(messages)

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, max_tokens=256)
chain = LLMChain(llm=llm, prompt=prompt, verbose=False)

def print_result_simple(query):
    result = chain(query)
    print(f"### Question: {query} ### Answer: {result['text']}")

print_result_simple("How many databases can be in a Yellowbrick Instance?")
```
## Part 2: Creating the Embeddings Table in Yellowbrick

```python
import psycopg2

# Replace the placeholders with your Yellowbrick credentials
yellowbrick_connection_string = "postgres://[username]:[password]@[host]:5432/[database]"
embedding_table = "my_embeddings"

try:
    conn = psycopg2.connect(yellowbrick_connection_string)
    cursor = conn.cursor()
    create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {embedding_table} (
        doc_id uuid NOT NULL,
        embedding_id smallint NOT NULL,
        embedding double precision NOT NULL
    )
    DISTRIBUTE ON (doc_id);
    """
    cursor.execute(create_table_query)
    conn.commit()
    cursor.close()
    conn.close()
    print(f"Table '{embedding_table}' created successfully!")
except psycopg2.Error as e:
    print(f"Error: {e}")
```
## Part 3: Extracting Documents for Indexing

```python
# Assumes the source documents already live in a Yellowbrick table
# named yellowbrick_documentation with (path, document) columns
conn = psycopg2.connect(yellowbrick_connection_string)
cursor = conn.cursor()
query = "SELECT path, document FROM yellowbrick_documentation"
cursor.execute(query)
yellowbrick_documents = cursor.fetchall()
cursor.close()
conn.close()
```
## Part 4: Loading Yellowbrick with Embeddings

```python
from langchain_community.vectorstores import Yellowbrick
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Keep the document path as metadata so the chatbot can cite its sources later
documents = [
    Document(page_content=document[1], metadata={"source": document[0]})
    for document in yellowbrick_documents
]

text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
split_docs = text_splitter.split_documents(documents)

embeddings = OpenAIEmbeddings()
vector_store = Yellowbrick.from_documents(
    documents=split_docs,
    embedding=embeddings,
    connection_string=yellowbrick_connection_string,
    table=embedding_table,
)
```
## Part 5: Creating a Chatbot That Uses Yellowbrick

```python
from langchain.chains import RetrievalQAWithSourcesChain

system_template = """Use the following pieces of context to answer the user's question.
----------------
{summaries}"""
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]
prompt = ChatPromptTemplate.from_messages(messages)

vector_store = Yellowbrick(OpenAIEmbeddings(), yellowbrick_connection_string, embedding_table)

chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=ChatOpenAI(model_name="gpt-3.5-turbo"),
    retriever=vector_store.as_retriever(search_kwargs={"k": 5}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt},
)

def print_result_sources(query):
    result = chain(query)
    print(f"### Question: {query} ### Answer: {result['answer']} ### Sources: {result['sources']}")

print_result_sources("How many databases can be in a Yellowbrick Instance?")
```
## Part 6: Improving Performance with an Index

```python
# Build an approximate LSH index over the embeddings to speed up similarity search
lsh_params = Yellowbrick.IndexParams(Yellowbrick.IndexType.LSH, {"num_hyperplanes": 8, "hamming_distance": 2})
vector_store.create_index(lsh_params)
```
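
Once the index exists, retrieval still goes through the same vector store interface. As a minimal sanity check (the query string below is just an example), you can run a direct similarity search against the store:

```python
# Minimal check: fetch the top matching chunks for a sample question
docs = vector_store.similarity_search(
    "How many databases can be in a Yellowbrick Instance?", k=5
)
for doc in docs:
    print(doc.metadata.get("source"), doc.page_content[:80])
```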
# Common Issues and Solutions

- Network restrictions: when calling the OpenAI API from some regions, you may need to route requests through an API proxy service for more stable access; see the sketch after this list.
- Performance: as the number of documents grows, an index (see Part 6) can dramatically speed up queries.
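
A minimal sketch of pointing the LangChain OpenAI clients at a proxy endpoint; the URL below is a placeholder, and whether you need this at all depends on your network environment:

```python
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# "https://your-api-proxy.example.com/v1" is a placeholder, not a real service
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    base_url="https://your-api-proxy.example.com/v1",  # proxy endpoint
)
embeddings = OpenAIEmbeddings(
    base_url="https://your-api-proxy.example.com/v1",
)
```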
# Summary and Further Learning

With this tutorial, you have learned how to use Yellowbrick as a vector store to ground ChatGPT's responses in your own documents. A good next step is learning how to configure and tune Yellowbrick's index for different workloads; a small tuning sketch follows.
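
For example, the LSH parameters from Part 6 can be varied: more hyperplanes give finer-grained hash buckets, while a larger Hamming distance widens the candidate set during search. The values below are illustrative only, and depending on your library version you may need to remove any existing index before building a new one:

```python
# Illustrative values only -- not tuned recommendations
finer_lsh_params = Yellowbrick.IndexParams(
    Yellowbrick.IndexType.LSH,
    {"num_hyperplanes": 16, "hamming_distance": 4},
)
vector_store.create_index(finer_lsh_params)
```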
If this article helped you, a like and a follow on my blog are appreciated. Your support keeps me writing!