RAG系统之预处理 md 文档

506 阅读2分钟

文档例子

langchain官方文档

需求

把文档中的内容按照标头拆分并建立索引,文档中的超链接也要加载、拆分并建立索引。

处理思路

  1. 读取 markdown 格式的文件,按照标头进行切分
  2. 读取其它链接的内容,按照字符数进行切分
  3. 向量化并存入 Chroma 数据库
  4. 采用 langchain 框架

读取 markdown 格式的文件,按照标头进行切分

from langchain_text_splitters import MarkdownHeaderTextSplitter

# Read the Markdown document.
# Fix: specify encoding explicitly — without it, open() uses the platform's
# locale default, which breaks on non-ASCII READMEs under e.g. Windows/GBK.
with open("README.md", "r", encoding="utf-8") as file:
    md_content = file.read()

# Header levels to split on; the second item becomes the metadata key
# attached to each resulting chunk.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

# Split the document into one Document per header section.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
docs = markdown_splitter.split_text(md_content)

读取其它链接的内容,按照字符数进行切分

import requests
from bs4 import BeautifulSoup
from markdown import markdown
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Render the Markdown to HTML so hyperlinks can be extracted with an HTML parser.
html_content = markdown(md_content)

# Extract hyperlinks from the rendered HTML.
# Fix: keep only absolute http(s) URLs — Markdown docs commonly contain
# relative paths and in-page anchors ("#section"), which requests.get cannot
# fetch and which previously just produced per-link errors.
soup = BeautifulSoup(html_content, "html.parser")
links = [
    a["href"]
    for a in soup.find_all("a", href=True)
    if a["href"].startswith(("http://", "https://"))
]


def fetch_url_content(url):
    """Fetch *url* and return its visible text, or None on any request error."""
    try:
        # Fix: a timeout keeps one unresponsive host from hanging the whole run.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        # Fix: strip markup before indexing — embedding raw HTML (tags,
        # scripts, styles) pollutes the vector store with boilerplate.
        page = BeautifulSoup(response.text, "html.parser")
        return page.get_text(separator="\n")
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None


# Collect the text of every link that could be fetched.
link_contents = []
for link in links:
    content = fetch_url_content(link)
    if content:
        link_contents.append(content)


# Chunk the fetched text by character count with overlap for context.
# Fix: dropped is_separator_regex=True — the default separators
# ("\n\n", "\n", " ", "") are plain strings, so the regex flag was wrong.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
docs.extend(text_splitter.create_documents(link_contents))

向量化并存入 Chroma 数据库

import os

# SECURITY FIX: the original hard-coded a live API key and a third-party proxy
# base URL directly in source. Credentials must come from the environment:
#   export OPENAI_API_KEY=sk-...
#   export OPENAI_API_BASE=https://api.openai.com/v1   # optional override

from langchain_openai import OpenAIEmbeddings

api_key = os.getenv("OPENAI_API_KEY")
base_url = os.getenv("OPENAI_API_BASE")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY environment variable is not set")

embeddings = OpenAIEmbeddings(openai_api_key=api_key, openai_api_base=base_url)

from langchain_chroma import Chroma

# Embed every chunk and persist them into a local Chroma collection.
# Fix: VectorStore.from_documents takes `embedding=`, not `embedding_function=`
# (that keyword belongs to the raw chromadb client); with the wrong name the
# required `embedding` argument was never bound.
vector_store = Chroma.from_documents(
    documents=docs,
    collection_name="example_collection",
    embedding=embeddings,
    persist_directory="./chroma_langchain_db",  # where to persist locally; remove if not needed
)

完整代码

from langchain_text_splitters import MarkdownHeaderTextSplitter

# --- 1. Read the Markdown document and split it by headers ----------------
# Fix: explicit UTF-8 encoding — the locale default breaks on non-ASCII text.
with open("README.md", "r", encoding="utf-8") as file:
    md_content = file.read()

# Header levels to split on; the second item is the metadata key per chunk.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
docs = markdown_splitter.split_text(md_content)

import requests
from bs4 import BeautifulSoup
from markdown import markdown
from langchain.text_splitter import RecursiveCharacterTextSplitter

# --- 2. Extract hyperlinks, fetch them, and split by character count ------
# Render Markdown to HTML so links can be pulled out with an HTML parser.
html_content = markdown(md_content)

# Fix: keep only absolute http(s) URLs — relative paths and "#anchor" links
# cannot be fetched and previously produced per-link errors.
soup = BeautifulSoup(html_content, "html.parser")
links = [
    a["href"]
    for a in soup.find_all("a", href=True)
    if a["href"].startswith(("http://", "https://"))
]
print("Extracted Links:", links)


def fetch_url_content(url):
    """Fetch *url* and return its visible text, or None on any request error."""
    try:
        # Fix: timeout keeps one unresponsive host from hanging the whole run.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        # Fix: strip markup before indexing — raw HTML pollutes embeddings.
        return BeautifulSoup(response.text, "html.parser").get_text(separator="\n")
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None


# Collect the text of every link that could be fetched.
link_contents = []
for link in links:
    content = fetch_url_content(link)
    if content:
        link_contents.append(content)


# Fix: dropped is_separator_regex=True — the default separators are plain
# strings, so the regex flag was incorrect.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
docs.extend(text_splitter.create_documents(link_contents))

print(f"Total Chunks Created: {len(docs)}")

import os

# --- 3. Embed all chunks and persist them into Chroma ---------------------
# SECURITY FIX: the original hard-coded a live API key and a proxy base URL
# in source. Credentials must come from the environment instead:
#   export OPENAI_API_KEY=sk-...
#   export OPENAI_API_BASE=https://api.openai.com/v1   # optional override

from langchain_openai import OpenAIEmbeddings

api_key = os.getenv("OPENAI_API_KEY")
base_url = os.getenv("OPENAI_API_BASE")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY environment variable is not set")

embeddings = OpenAIEmbeddings(openai_api_key=api_key, openai_api_base=base_url)

from langchain_chroma import Chroma

# Fix: VectorStore.from_documents takes `embedding=`, not `embedding_function=`
# (that keyword belongs to the raw chromadb client); with the wrong name the
# required `embedding` argument was never bound.
vector_store = Chroma.from_documents(
    documents=docs,
    collection_name="example_collection",
    embedding=embeddings,
    persist_directory="./chroma_langchain_db",  # where to persist locally; remove if not needed
)

参考资料

  1. How to: split Markdown by headers
  2. How to: recursively split text
  3. langchain-Chroma