I have recently been digging into knowledge graphs combined with large language models, with the goal of using a knowledge-graph approach to trace the root causes of production incidents. That led me to GraphRAG, a structured, hierarchical approach to retrieval-augmented generation (RAG), as opposed to plain semantic search over text chunks. I originally tried to follow some online video tutorials, but their graphrag version was too old and my runs kept failing; after a few days of experimenting with the new version I finally got it working. The steps are below.
Step 1: install graphrag
pip install graphrag
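Since a too-old graphrag version was exactly what broke the video tutorials for me, it helps to record which version you install. The pin below is only an illustration, not a recommendation of that specific release; substitute whichever version you have verified:
pip install graphrag==1.0.1   # example pin; use the version you actually tested
pip show graphrag             # confirm which version got installed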
Step 2: create the input directory
mkdir -p ./ragtest/input
Step 3: download a book into the input folder
curl https://www.gutenberg.org/cache/epub/24022/pg24022.txt -o ./ragtest/input/book.txt
Step 4: initialize the workspace
graphrag init --root ./ragtest
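If the init succeeds, the ragtest directory should contain at least the generated .env, settings.yaml, and a prompts/ folder; roughly like this (exact contents vary a little between graphrag versions):
ragtest/
├── .env            # holds GRAPHRAG_API_KEY
├── settings.yaml   # main configuration, edited in the next step
├── prompts/        # default prompt templates referenced by settings.yaml
└── input/          # created in step 2, holds book.txt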
After initialization you need to reconfigure the generated settings.yaml; the full configuration is at the end of this post, so copy it directly if you need it.
Step 5: build the index
graphrag index --root ./ragtest
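Indexing a full book takes a while and makes many model calls. When it finishes, the artifacts (parquet tables plus the lancedb vector store, per the storage settings below) should land in the configured output directory, which you can sanity-check with:
ls ./ragtest/output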
Step 6: run a query
graphrag query --root ./ragtest --method global --query "What did Wukong learn?"
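Global search aggregates the community reports to answer corpus-wide questions; for a question centered on one specific entity, the local method is usually the better fit. An illustrative local query against the book downloaded in step 3 (A Christmas Carol):
graphrag query --root ./ragtest --method local --query "Who is Scrooge?"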
The configuration is as follows. I used Alibaba's Qwen (Tongyi Qianwen) models, so register on the official site for an API key and put it into the generated .env file; the rest of the settings.yaml configuration is below, and after that you can simply follow the official steps.
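For reference, the generated .env file only needs the single variable that the YAML below interpolates; the key value here is a placeholder, not a real key:
GRAPHRAG_API_KEY=sk-your-dashscope-api-key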
encoding_model: cl100k_base # this needs to be matched to your model!

llm:
  api_key: ${GRAPHRAG_API_KEY} # set this in the generated .env file
  type: openai_chat # or azure_openai_chat
  model: qwen-long
  model_supports_json: false # recommended if this is available for your model.
  # audience: "https://cognitiveservices.azure.com/.default"
  api_base: https://dashscope.aliyuncs.com/compatible-mode/v1
  # api_version: 2024-02-15-preview
  # organization: <organization_id>
  # deployment_name: <azure_model_deployment_name>

parallelization:
  stagger: 0.3
  # num_threads: 50

async_mode: threaded # or asyncio

embeddings:
  async_mode: threaded # or asyncio
  vector_store:
    type: lancedb # one of [lancedb, azure_ai_search, cosmosdb]
    db_uri: 'output\lancedb'
    collection_name: default
    overwrite: true
  llm:
    api_key: ${GRAPHRAG_API_KEY}
    type: openai_embedding # or azure_openai_embedding
    model: text-embedding-v3
    api_base: https://dashscope.aliyuncs.com/compatible-mode/v1
    # api_version: 2024-02-15-preview
    # audience: "https://cognitiveservices.azure.com/.default"
    # organization: <organization_id>
    # deployment_name: <azure_model_deployment_name>

### Input settings ###

input:
  type: file # or blob
  file_type: text # or csv
  base_dir: "input"
  file_encoding: utf-8
  file_pattern: ".*\\.txt$"

chunks:
  size: 1200
  overlap: 100
  group_by_columns: [id]

### Storage settings ###
## If blob storage is specified in the following four sections,
## connection_string and container_name must be provided

cache:
  type: file # one of [blob, cosmosdb, file]
  base_dir: "cache"

reporting:
  type: file # or console, blob
  base_dir: "logs"

storage:
  type: file # one of [blob, cosmosdb, file]
  base_dir: "output"

## only turn this on if running `graphrag index` with custom settings
## we normally use `graphrag update` with the defaults
update_index_storage:
  # type: file # or blob
  # base_dir: "update_output"

### Workflow settings ###

skip_workflows: []

entity_extraction:
  prompt: "prompts/entity_extraction.txt"
  entity_types: [organization,person,geo,event]
  max_gleanings: 1

summarize_descriptions:
  prompt: "prompts/summarize_descriptions.txt"
  max_length: 500

claim_extraction:
  enabled: false
  prompt: "prompts/claim_extraction.txt"
  description: "Any claims or facts that could be relevant to information discovery."
  max_gleanings: 1

community_reports:
  prompt: "prompts/community_report.txt"
  max_length: 2000
  max_input_length: 8000

cluster_graph:
  max_cluster_size: 10

embed_graph:
  enabled: true # if true, will generate node2vec embeddings for nodes

umap:
  enabled: true # if true, will generate UMAP embeddings for nodes (embed_graph must also be enabled)

snapshots:
  graphml: true
  embeddings: false
  transient: false

### Query settings ###
## The prompt locations are required here, but each search method has a number of optional knobs that can be tuned.
## See the config docs: https://microsoft.github.io/graphrag/config/yaml/#query

local_search:
  prompt: "prompts/local_search_system_prompt.txt"

global_search:
  map_prompt: "prompts/global_search_map_system_prompt.txt"
  reduce_prompt: "prompts/global_search_reduce_system_prompt.txt"
  knowledge_prompt: "prompts/global_search_knowledge_system_prompt.txt"

drift_search:
  prompt: "prompts/drift_search_system_prompt.txt"
  reduce_prompt: "prompts/drift_search_reduce_prompt.txt"