How to use functions with a knowledge base
This notebook builds on the argument-generation notebook by creating an agent with access to a knowledge base and two functions it can call based on the user's requirements. We will create an agent that uses data from arXiv to answer questions about academic subjects. It has two functions at its disposal:
- get_articles: a function that gets arXiv articles on a subject and lists their summaries with links
- read_article_and_summarize: takes one of the previously searched articles, reads it in its entirety, and summarizes the core argument, evidence and conclusions
This will get you comfortable with a multi-function workflow that can choose between multiple services, and where some of the data from the first function is persisted to be used by the second.
Walkthrough
This notebook takes you through the following workflow:
- Search utilities: creating the two functions that access arXiv for answers
- Configure agent: building up the agent behaviour that will assess whether a function is needed and, if so, call that function and present the results back to the agent
- arXiv conversation: putting it all together in a live conversation
# Install and import the required packages
!pip install scipy
!pip install tenacity
!pip install tiktoken==0.3.3
!pip install termcolor
!pip install openai
!pip install arxiv
!pip install pandas
!pip install PyPDF2
!pip install tqdm
import os
import arxiv
import ast
import concurrent
import json
import os
import pandas as pd
import tiktoken
from csv import writer
from IPython.display import display, Markdown, Latex
from openai import OpenAI
from PyPDF2 import PdfReader
from scipy import spatial
from tenacity import retry, wait_random_exponential, stop_after_attempt
from tqdm import tqdm
from termcolor import colored
GPT_MODEL = "gpt-3.5-turbo-0613"
EMBEDDING_MODEL = "text-embedding-ada-002"
client = OpenAI()
1. Search utilities
We'll first set up some utilities that will underpin our two functions.
Downloaded papers will be stored in a directory (we use ./data/papers here). We create a file arxiv_library.csv to store the embeddings and details of downloaded papers so they can be retrieved by summarize_text.
directory = './data/papers'
# Check whether the directory already exists
if not os.path.exists(directory):
    # If not, create it (along with any intermediate directories)
    os.makedirs(directory)
    print(f"Directory '{directory}' created successfully.")
else:
    # If it already exists, just report it
    print(f"Directory '{directory}' already exists.")
# Set a directory to store downloaded papers
data_dir = os.path.join(os.curdir, "data", "papers")
paper_dir_filepath = "./data/arxiv_library.csv"
# Create an empty dataframe to initialize the library CSV that will record downloaded papers
df = pd.DataFrame(list())
df.to_csv(paper_dir_filepath)
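For reference, each row that get_articles (defined below) appends to this library file has the form [title, filepath, embedding]. The sketch below is purely illustrative; every value in it is made up.

# Illustration only (hypothetical values): the shape of one library row appended by get_articles —
# the paper title, the local path returned by result.download_pdf(data_dir), and the title embedding.
example_row = [
    "An Example Paper Title",              # paper title
    "./data/papers/An_Example_Paper.pdf",  # local PDF filepath
    [0.0123, -0.0456, 0.0789],             # embedding vector (truncated, made up)
]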
@retry(wait=wait_random_exponential(min=1, max=40), stop=stop_after_attempt(3))
def embedding_request(text):
    # Create an embedding for the given text
response = client.embeddings.create(input=text, model=EMBEDDING_MODEL)
return response
@retry(wait=wait_random_exponential(min=1, max=40), stop=stop_after_attempt(3))
def get_articles(query, library=paper_dir_filepath, top_k=5):
    """
    Get the top_k articles for a user's query and store the relevant information.
    Downloads the PDFs and records them in arxiv_library.csv so they can later be
    retrieved by read_article_and_summarize.
    """
    client = arxiv.Client()
    search = arxiv.Search(
        query=query,
        max_results=top_k,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )
    result_list = []
    for result in client.results(search):
        result_dict = {}
        result_dict.update({"title": result.title})
        result_dict.update({"summary": result.summary})
        # Take the first URL provided (abstract page), then the PDF URL
        result_dict.update({"article_url": [x.href for x in result.links][0]})
        result_dict.update({"pdf_url": [x.href for x in result.links][1]})
        result_list.append(result_dict)

        # Store a reference (title, local filepath, title embedding) in the library file
        response = embedding_request(text=result.title)
        file_reference = [
            result.title,
            result.download_pdf(data_dir),
            response.data[0].embedding,
        ]

        # Write the reference to the library CSV
        with open(library, "a") as f_object:
            writer_object = writer(f_object)
            writer_object.writerow(file_reference)
            f_object.close()
    return result_list
# Test that the search works
result_output = get_articles("ppo reinforcement learning")
result_output[0]
{'title': 'Entanglement entropy and deconfined criticality: emergent SO(5) symmetry and proper lattice bipartition',
'summary': "We study the R\\'enyi entanglement entropy (EE) of the two-dimensional $J$-$Q$\nmodel, the emblematic quantum spin model of deconfined criticality at the phase\ntransition between antiferromagnetic and valence-bond-solid ground states.\nQuantum Monte Carlo simulations with an improved EE scheme reveal critical\ncorner contributions that scale logarithmically with the system size, with a\ncoefficient in remarkable agreement with the form expected from a large-$N$\nconformal field theory with SO($N=5$) symmetry. However, details of the\nbipartition of the lattice are crucial in order to observe this behavior. If\nthe subsystem for the reduced density matrix does not properly accommodate\nvalence-bond fluctuations, logarithmic contributions appear even for\ncorner-less bipartitions. We here use a $45^\\circ$ tilted cut on the square\nlattice. Beyond supporting an SO($5$) deconfined quantum critical point, our\nresults for both the regular and tilted cuts demonstrate important microscopic\naspects of the EE that are not captured by conformal field theory.",
'article_url': 'http://arxiv.org/abs/2401.14396v1',
'pdf_url': 'http://arxiv.org/pdf/2401.14396v1'}
"""
根据查询字符串(query)与一个DataFrame(df)中的嵌入(embedding)之间的相关度(relatedness)来排序字符串
relatedness_fn:用于计算两个嵌入向量之间的相关度。默认是一个使用余弦距离来计算相关度的lambda函数,其中1减去余弦距离作为相关度的度量。
余弦距离越接近1,表示两个向量越相似;
余弦距离越接近-1,表示两个向量越不相似
top_n: 一个整数,表示要返回的与查询最相关的字符串的数量
"""
def strings_ranked_by_relatedness(
query: str,
df: pd.DataFrame,
relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
top_n: int = 100,
) -> list[str]:
"""
    Returns a list of strings sorted from most related to least related.
"""
    # Get the embedding for the user query
query_embedding_response = embedding_request(query)
query_embedding = query_embedding_response.data[0].embedding
strings_and_relatednesses = [
(row["filepath"], relatedness_fn(query_embedding, row["embedding"]))
for i, row in df.iterrows()
]
strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
strings, relatednesses = zip(*strings_and_relatednesses)
return strings[:top_n]
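To make the default relatedness_fn concrete, here is a small, self-contained check using toy vectors (not real embeddings) of the 1 - cosine distance measure described above:

# Toy check of the default relatedness measure: 1 - cosine distance
from scipy import spatial

a = [1.0, 0.0, 1.0]
b = [2.0, 0.0, 2.0]   # same direction as a
c = [0.0, 1.0, 0.0]   # orthogonal to a

relatedness_fn = lambda x, y: 1 - spatial.distance.cosine(x, y)
print(relatedness_fn(a, b))  # ~1.0: parallel vectors are maximally related
print(relatedness_fn(a, c))  # ~0.0: orthogonal vectors are unrelated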
def read_pdf(filepath):
"""
    Take a filepath to a PDF and return a string of the PDF's contents.
"""
    # Create a PdfReader object
reader = PdfReader(filepath)
pdf_text = ""
page_number = 0
for page in reader.pages:
page_number += 1
pdf_text += page.extract_text() + f"\nPage Number: {page_number}"
return pdf_text
# Split a text into smaller chunks of roughly n tokens, preferably ending at a sentence boundary
def create_chunks(text, n, tokenizer):
    """Yield successive chunks of about n tokens from the provided text."""
    tokens = tokenizer.encode(text)
    i = 0
    while i < len(tokens):
        # Find the nearest sentence end within a window of 0.5*n to 1.5*n tokens
        j = min(i + int(1.5 * n), len(tokens))
        while j > i + int(0.5 * n):
            # Decode the candidate chunk and check for a full stop or newline
            chunk = tokenizer.decode(tokens[i:j])
            if chunk.endswith(".") or chunk.endswith("\n"):
                break
            j -= 1
        # If no sentence end was found, fall back to a chunk of n tokens
        if j == i + int(0.5 * n):
            j = min(i + n, len(tokens))
        yield tokens[i:j]
        i = j
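As a quick, optional illustration of the chunking behaviour, the sketch below runs create_chunks on a toy string with a deliberately tiny chunk size (the real workflow below uses 1500-token chunks):

# Toy demo of create_chunks: short sentences and a small chunk size, just for inspection
demo_tokenizer = tiktoken.get_encoding("cl100k_base")
demo_text = "First sentence here. Second sentence follows. Third sentence ends.\nA final line."
for demo_chunk in create_chunks(demo_text, 8, demo_tokenizer):
    print(repr(demo_tokenizer.decode(demo_chunk)))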
def extract_chunk(content, template_prompt):
"""
使用提示词作为输入,返回一块摘要文本
"""
prompt = template_prompt + content
response = client.chat.completions.create(
model=GPT_MODEL, messages=[{"role": "user", "content": prompt}], temperature=0
)
return response.choices[0].message.content
def summarize_text(query):
    """
    This function does the following:
    - Reads in the arxiv_library.csv file, including the embeddings
    - Finds the file closest to the user's query
    - Scrapes the text out of the file and chunks it
    - Summarizes each chunk in parallel
    - Produces a final summary and returns it to the user
    """
    # A prompt dictating how each chunk of the input paper should be summarized
    summary_prompt = """Summarize this text from an academic paper. Extract any key points with reasoning.\n\nContent:"""

    # If the library is empty (no searches have been performed yet), perform one and download the results
    library_df = pd.read_csv(paper_dir_filepath).reset_index()
    if len(library_df) == 0:
        print("No papers searched yet, downloading first.")
        get_articles(query)
        print("Papers downloaded, continuing.")
        library_df = pd.read_csv(paper_dir_filepath).reset_index()
    library_df.columns = ["title", "filepath", "embedding"]
    library_df["embedding"] = library_df["embedding"].apply(ast.literal_eval)
    strings = strings_ranked_by_relatedness(query, library_df, top_n=1)
    print("Chunking text from paper")
    pdf_text = read_pdf(strings[0])

    # Initialise tokenizer
    tokenizer = tiktoken.get_encoding("cl100k_base")
    results = ""

    # Chunk up the document into 1500-token chunks
    chunks = create_chunks(pdf_text, 1500, tokenizer)
    text_chunks = [tokenizer.decode(chunk) for chunk in chunks]
    print("Summarizing each chunk of text")

    # Process the summaries in parallel
    with concurrent.futures.ThreadPoolExecutor(
        max_workers=len(text_chunks)
    ) as executor:
        futures = [
            executor.submit(extract_chunk, chunk, summary_prompt)
            for chunk in text_chunks
        ]
        with tqdm(total=len(text_chunks)) as pbar:
            for _ in concurrent.futures.as_completed(futures):
                pbar.update(1)
        for future in futures:
            data = future.result()
            results += data

    # Final summary
    print("Summarizing into overall summary")
    response = client.chat.completions.create(
        model=GPT_MODEL,
        messages=[
            {
                "role": "user",
                "content": f"""Write a summary collated from this collection of key points extracted from an academic paper.
                        The summary should highlight the core argument, conclusions and evidence, and answer the user's query.
                        User query: {query}
                        The summary should be structured in bulleted lists following the headings Core Argument, Evidence, and Conclusions.
                        Key points:\n{results}\nSummary:\n""",
            }
        ],
        temperature=0,
    )
    return response
# Test the summarization pipeline
chat_test_response = summarize_text("PPO reinforcement learning sequence generation")
100%|██████████| 15/15 [00:08<00:00, 1.76it/s]
print(chat_test_response.choices[0].message.content)
The academic paper discusses the unique decomposition of generators of completely positive dynamical semigroups in infinite dimensions. The main result of the paper is that for any separable complex Hilbert space, any trace-class operator B that does not have a purely imaginary trace, and any generator L of a norm-continuous one-parameter semigroup of completely positive maps, there exists a unique bounded operator K and a unique completely positive map Φ such that L=K(·) + (·)K∗+ Φ. The paper also introduces a modified version of the Choi formalism, which relates completely positive maps to positive semi-definite operators, and characterizes when this correspondence is injective and surjective. The paper concludes by discussing the challenges and questions that arise when generalizing the results to non-separable Hilbert spaces.
2. Configure agent
In this step we'll create our agent, including a Conversation class to support multiple turns with the API, and some Python functions that enable interaction between the Chat Completions API and our knowledge base functions.
@retry(wait=wait_random_exponential(min=1, max=40), stop=stop_after_attempt(3))
def chat_completion_request(messages, functions=None, model=GPT_MODEL):
try:
response = client.chat.completions.create(
model=model,
messages=messages,
functions=functions,
)
return response
except Exception as e:
print("Unable to generate ChatCompletion response")
print(f"Exception: {e}")
return e
class Conversation:
def __init__(self):
self.conversation_history = []
def add_message(self, role, content):
message = {"role": role, "content": content}
self.conversation_history.append(message)
def display_conversation(self, detailed=False):
role_to_color = {
"system": "red",
"user": "green",
"assistant": "blue",
"function": "magenta",
}
for message in self.conversation_history:
print(
colored(
f"{message['role']}: {message['content']}\n\n",
role_to_color[message["role"]],
)
)
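As a quick sanity check of the Conversation helper (throwaway messages, not part of the arXiv workflow), you could run something like:

# Throwaway example of the Conversation class defined above
demo_conversation = Conversation()
demo_conversation.add_message("system", "You are a helpful assistant.")
demo_conversation.add_message("user", "Hello!")
demo_conversation.display_conversation()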
# Initiate our get_articles and read_article_and_summarize functions
arxiv_functions = [
    {
        "name": "get_articles",
        "description": """Use this function to get academic papers from arXiv to answer user questions.""",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": """
                            User query in JSON. Responses should be summarized and should include the article URL reference
                            """,
                }
            },
            "required": ["query"],
        },
    },
    {
        "name": "read_article_and_summarize",
        "description": """
        Use this function to read whole papers and provide a summary for users.
        You should NEVER call this function before get_articles has been called in the conversation.
        """,
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": """
                            Description of the article in plain text based on the user's query
                            """,
                }
            },
            "required": ["query"],
        },
    }
]
def chat_completion_with_function_execution(messages, functions=[None]):
    """This function makes a ChatCompletion API call with the option of adding functions"""
    response = chat_completion_request(messages, functions)
    full_message = response.choices[0]
    if full_message.finish_reason == "function_call":
        print("Function generation requested, calling function")
        return call_arxiv_function(messages, full_message)
    else:
        print("Function not required, responding to user")
        return response
def call_arxiv_function(messages, full_message):
    """Executes function calls when the model believes one is necessary.
    Currently extended by adding clauses to this if statement."""

    if full_message.message.function_call.name == "get_articles":
        try:
            parsed_output = json.loads(
                full_message.message.function_call.arguments
            )
            print("Getting search results")
            results = get_articles(parsed_output["query"])
        except Exception as e:
            print(parsed_output)
            print("Function execution failed")
            print(f"Error message: {e}")
        messages.append(
            {
                "role": "function",
                "name": full_message.message.function_call.name,
                "content": str(results),
            }
        )
        try:
            print("Got search results, summarizing content")
            response = chat_completion_request(messages)
            return response
        except Exception as e:
            print(type(e))
            raise Exception("Function chat request failed")

    elif (
        full_message.message.function_call.name == "read_article_and_summarize"
    ):
        parsed_output = json.loads(
            full_message.message.function_call.arguments
        )
        print("Finding and reading paper")
        summary = summarize_text(parsed_output["query"])
        return summary

    else:
        raise Exception("Function does not exist and cannot be called")
3. arXiv conversation
Let's put this all together by testing our functions' outputs within a conversation:
# Start with a system message
paper_system_message = """You are arXivGPT, a helpful assistant that pulls academic papers to answer user questions.
You summarize the papers clearly so the customer can decide which to read to answer their question.
You always provide the article_url and title so the user can understand the name of the paper and click through to access it.
Begin!"""
paper_conversation = Conversation()
paper_conversation.add_message("system", paper_system_message)

# Add a user message
paper_conversation.add_message("user", "Hi, how does PPO reinforcement learning work?")
chat_response = chat_completion_with_function_execution(
paper_conversation.conversation_history, functions=arxiv_functions
)
assistant_message = chat_response.choices[0].message.content
paper_conversation.add_message("assistant", assistant_message)
display(Markdown(assistant_message))
# Add another user message to induce our system to use the second tool
paper_conversation.add_message(
    "user",
    "Can you read the PPO sequence generation paper for me and give me a summary?",
)
updated_response = chat_completion_with_function_execution(
paper_conversation.conversation_history, functions=arxiv_functions
)
display(Markdown(updated_response.choices[0].message.content))
Function generation requested, calling function
Finding and reading paper
Chunking text from paper
Summarizing each chunk of text
100%|██████████| 15/15 [00:09<00:00, 1.67it/s]