Note: in practice, GPT-4 grades the final results accurately, but GPT-3.5 cannot correctly follow the default evaluation prompt used by QAEvalChain.
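If only GPT-3.5 is available, one workaround is to hand QAEvalChain a simpler grading prompt of your own: from_llm accepts a prompt argument whose input variables must be query, answer, and result. A minimal sketch (the prompt wording below is illustrative, not the library default):

from langchain.chat_models import ChatOpenAI
from langchain.evaluation.qa import QAEvalChain
from langchain.prompts import PromptTemplate

# Illustrative wording -- tune it for your model.
simple_eval_prompt = PromptTemplate(
    input_variables=["query", "answer", "result"],
    template="""You are grading a student's answer against the true answer.
QUESTION: {query}
TRUE ANSWER: {answer}
STUDENT ANSWER: {result}
Reply with exactly one word: CORRECT or INCORRECT.""",
)
eval_chain = QAEvalChain.from_llm(ChatOpenAI(temperature=0), prompt=simple_eval_prompt)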
Here is a simplified walkthrough of the code:
- Environment setup: set the OpenAI API key and turn on debug mode.
- Define and initialize the custom model: create a custom language model, CustomLLM, and instantiate it.
- Data loading: load the hand-curated QA pairs and the documents in the "data" directory.
- Automatic QA generation:
  - Define a prompt template and an output parser for generating QA pairs from documents.
  - Use GPT-4 with that template and parser to generate new QA pairs from a subset of the documents.
  - Append the newly generated QA pairs to the hand-curated list.
- Model evaluation:
  - For each hand-curated and auto-generated QA pair, use CustomLLM to produce a predicted answer.
  - Use GPT-4 to grade each predicted answer, yielding an evaluation result.
- Output: print the question, the reference answer, the predicted answer, and the grade for each example.

In short, the code first loads some QA pairs, then automatically generates more from the documents. It then uses the custom model to predict answers and evaluates how accurate those predictions are.
from __future__ import annotations

import os
from typing import Any, List, Mapping, Optional

import requests

import langchain
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.chat_models import ChatOpenAI
from langchain.llms.base import LLM
from langchain.prompts import PromptTemplate
os.environ["OPENAI_API_KEY"] = "xxx"  # placeholder; set your real key here
langchain.debug = True  # verbose chain traces while developing
class CustomLLM(LLM):
    """Minimal LLM wrapper around a self-hosted chat endpoint."""

    @property
    def _llm_type(self) -> str:
        return "custom"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
    ) -> str:
        if stop is not None:
            raise ValueError("stop kwargs are not permitted.")
        data = {
            "prompt": prompt,
            "stream": False,
        }
        url = "http://xxxxx/api/chat"  # placeholder endpoint
        headers = {
            "accept": "application/json",
            "Content-Type": "application/json",
        }
        response = requests.post(url, headers=headers, json=data)
        response.raise_for_status()  # raise if the request failed
        print(response.json())  # debug: inspect the raw response
        # The endpoint is expected to return JSON shaped like {"result": "..."}.
        return response.json()["result"]

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {"api_key": "xxxxx"}
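# (Optional) smoke-test the endpoint before running the full pipeline, e.g.
#   print(CustomLLM()("hello"))
# This assumes the placeholder URL above points at a running service.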
qa = CustomLLM()  # the model whose answers will be evaluated
# Hand-curated QA pairs
examples = [
    {
        "query": "question xxx",
        "answer": "answer xxx",
    }
]
# Load the txt files from the data directory
data = []
for file in os.listdir("data"):
    with open(os.path.join("data", file), "r", encoding="utf-8") as f:
        data.append(f.read())
from langchain.output_parsers.regex import RegexParser
template = """You are a teacher coming up with questions to ask on a quiz.
Given the following document, please generate a question and answer based on that document.
Example Format:
<Begin Document>
...
<End Document>
QUESTION: question here
ANSWER: answer here
These questions should be detailed and be based explicitly on information in the document. Begin!
<Begin Document>
{doc}
<End Document>"""
output_parser = RegexParser(
    regex=r"QUESTION: (.*?)\n+ANSWER: (.*)", output_keys=["query", "answer"]
)
PROMPT = PromptTemplate(
    input_variables=["doc"], template=template, output_parser=output_parser
)
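# For reference, output_parser turns generated text like
#   "QUESTION: What is X?\nANSWER: X is ..."
# into {"query": "What is X?", "answer": "X is ..."} (keys set by output_keys).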
# Custom QA-generation chain
from langchain.base_language import BaseLanguageModel
from langchain.chains.llm import LLMChain
class CustomQAGenerateChain(LLMChain):
    """LLM chain specifically for generating QA examples from documents."""

    @classmethod
    def from_llm(cls, llm: BaseLanguageModel, **kwargs: Any) -> CustomQAGenerateChain:
        """Load a QA-generation chain from an LLM."""
        return cls(llm=llm, prompt=PROMPT, **kwargs)
# Automatically generate QA pairs from the documents
example_gen_chain = CustomQAGenerateChain.from_llm(ChatOpenAI())
new_examples = example_gen_chain.apply_and_parse(
    [{"doc": t} for t in data[:5]]  # only the first five documents
)
examples += new_examples
print('examples: ', examples)
# LLM-assisted evaluation: use an LLM to grade the QA results automatically
langchain.debug = False
# For each query, get the predicted answer from CustomLLM
predictions = []
for eg in examples:
    result = qa(eg["query"])
    predictions.append({
        "query": eg["query"],
        "answer": eg["answer"],
        "result": result,
    })
from langchain.evaluation.qa import QAEvalChain
# Per the note at the top: a GPT-4 model grades these reliably, while
# GPT-3.5 tends to misread the default evaluation prompt.
llm = ChatOpenAI(temperature=0)
eval_chain = QAEvalChain.from_llm(llm)
langchain.debug = True
# Grade each predicted answer against its reference answer. evaluate() reads
# the default keys "query", "answer", and "result" from these dicts.
graded_outputs = eval_chain.evaluate(examples, predictions)
for i, eg in enumerate(examples):
    print(f"Example {i}:")
    print("Question: " + predictions[i]["query"])
    print("Real Answer: " + predictions[i]["answer"])
    print("Predicted Answer: " + predictions[i]["result"])
    print("Predicted Grade: " + graded_outputs[i]["text"])
    print()
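The per-example grades can also be rolled up into a single accuracy figure. A minimal sketch, assuming the default grading prompt (whose verdicts are CORRECT or INCORRECT) and the graded_outputs list from above:

def is_correct(text: str) -> bool:
    # "INCORRECT" contains "CORRECT" as a substring, so check the negative first.
    verdict = text.strip().upper()
    return "INCORRECT" not in verdict and "CORRECT" in verdict

correct = sum(1 for g in graded_outputs if is_correct(g["text"]))
print(f"Accuracy: {correct}/{len(graded_outputs)}")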