文档例子
需求
把文档中的内容按照标头拆分并建立索引,文档中的超链接也要加载、拆分并建立索引。
处理思路
- 读取 markdown 格式的文件,按照标头进行切分
- 读取其它链接的内容,按照字符数进行切分
- 向量化并存入 Chroma 数据库
- 采用 langchain 框架
读取 markdown 格式的文件,按照标头进行切分
from langchain_text_splitters import MarkdownHeaderTextSplitter
# Read the Markdown document. Explicit UTF-8 is required: the default
# platform encoding (e.g. GBK/cp936 on Chinese Windows) would corrupt
# non-ASCII content or raise UnicodeDecodeError.
with open("README.md", "r", encoding="utf-8") as file:
    md_content = file.read()
# Split on up to three heading levels; the second element of each tuple
# becomes the metadata key attached to every resulting chunk.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
docs = markdown_splitter.split_text(md_content)
读取其它链接的内容,按照字符数进行切分
import requests
from bs4 import BeautifulSoup
from markdown import markdown
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Render the Markdown to HTML so hyperlinks become <a href="..."> tags.
html_content = markdown(md_content)
# Extract link targets, keeping only absolute http(s) URLs: anchors
# ("#...") and relative paths cannot be fetched with requests and would
# only produce per-link errors.
soup = BeautifulSoup(html_content, "html.parser")
links = [
    a["href"]
    for a in soup.find_all("a", href=True)
    if a["href"].startswith(("http://", "https://"))
]
# Fetch the body of one linked page.
def fetch_url_content(url, timeout=10):
    """Fetch *url* and return its HTML text, or None on any request error.

    A timeout is mandatory: requests.get without one can block forever on
    an unresponsive server and stall the whole indexing run.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.text  # raw HTML body of the page
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
# Collect the bodies of all reachable links (failed fetches return None
# and are skipped — best-effort by design).
link_contents = []
for link in links:
    content = fetch_url_content(link)
    if content:
        link_contents.append(content)
# Chunk the fetched pages by character count. The splitter's default
# separators ("\n\n", "\n", " ", "") are literal strings, so
# is_separator_regex must remain False; the original passed True, which
# makes the splitter misinterpret them as regex patterns.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=100
)
docs.extend(text_splitter.create_documents(link_contents))
向量化并存入 Chroma 数据库
import os
# SECURITY: the original hard-coded a live API key in source control.
# Credentials must come from the environment; only the non-secret base
# URL keeps a default (setdefault preserves any value already exported).
os.environ.setdefault("OPENAI_API_BASE", "https://api.wlai.vip/v1")
api_key = os.getenv("OPENAI_API_KEY")
base_url = os.getenv("OPENAI_API_BASE")
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(openai_api_key=api_key, openai_api_base=base_url)
from langchain_chroma import Chroma
# Embed every chunk and persist the collection locally.
# Note: Chroma.from_documents takes `embedding=`, not `embedding_function=`
# (that keyword belongs to the Chroma constructor and ends up being passed
# twice internally, raising a TypeError).
vector_store = Chroma.from_documents(
    documents=docs,
    collection_name="example_collection",
    embedding=embeddings,
    persist_directory="./chroma_langchain_db",  # remove to keep the index in-memory only
)
完整代码
from langchain_text_splitters import MarkdownHeaderTextSplitter
# --- 1. Read the Markdown document and split it by headers -------------
# Explicit UTF-8 avoids platform-default encodings corrupting non-ASCII text.
with open("README.md", "r", encoding="utf-8") as file:
    md_content = file.read()
# The second element of each tuple becomes the metadata key on each chunk.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
docs = markdown_splitter.split_text(md_content)
# --- 2. Fetch linked pages and split them by character count ----------
import requests
from bs4 import BeautifulSoup
from markdown import markdown
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Render the Markdown to HTML so hyperlinks become <a href="..."> tags.
html_content = markdown(md_content)
soup = BeautifulSoup(html_content, "html.parser")
# Keep only absolute http(s) URLs: anchors and relative paths cannot be
# fetched with requests.
links = [
    a["href"]
    for a in soup.find_all("a", href=True)
    if a["href"].startswith(("http://", "https://"))
]
print("Extracted Links:", links)
# Fetch the body of one linked page.
def fetch_url_content(url, timeout=10):
    """Fetch *url* and return its HTML text, or None on any request error.

    The timeout prevents a single unresponsive server from hanging the run.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.text  # raw HTML body of the page
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
# Collect all reachable link bodies; failed fetches are skipped (best-effort).
link_contents = []
for link in links:
    content = fetch_url_content(link)
    if content:
        link_contents.append(content)
# Default separators are literal strings, so is_separator_regex must stay
# False (the original passed True, misinterpreting them as regexes).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=100
)
docs.extend(text_splitter.create_documents(link_contents))
print(f"Total Chunks Created: {len(docs)}")
# --- 3. Embed and persist into Chroma ---------------------------------
import os
# SECURITY: the original hard-coded a live API key in source control.
# Credentials must come from the environment; only the non-secret base URL
# keeps a default (setdefault preserves any value already exported).
os.environ.setdefault("OPENAI_API_BASE", "https://api.wlai.vip/v1")
api_key = os.getenv("OPENAI_API_KEY")
base_url = os.getenv("OPENAI_API_BASE")
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(openai_api_key=api_key, openai_api_base=base_url)
from langchain_chroma import Chroma
# Note: Chroma.from_documents takes `embedding=`, not `embedding_function=`
# (that keyword belongs to the Chroma constructor and would be passed twice
# internally, raising a TypeError).
vector_store = Chroma.from_documents(
    documents=docs,
    collection_name="example_collection",
    embedding=embeddings,
    persist_directory="./chroma_langchain_db",  # remove to keep the index in-memory only
)