@[toc]
github:github.com/unclecode/c…
Crawl4AI is a powerful web-crawling and content-extraction library designed specifically for AI applications. Its main usage patterns and core features are covered below:
Installation
# Install the package
pip install -U crawl4ai
# For pre-release versions
pip install crawl4ai --pre
# Run post-installation setup
crawl4ai-setup
# Verify your installation
crawl4ai-doctor
Basic usage
1. Simple crawl
The crawler API is asynchronous, so inside an async context the minimal call looks like this (a complete runnable program follows in the next example):
from crawl4ai import AsyncWebCrawler

async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com")
    print(result.html)      # full page HTML
    print(result.markdown)  # content converted to Markdown
2. Async crawling (recommended)
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com",
            chunking_strategy="None",  # disable chunking
        )
        print(result.markdown)

asyncio.run(main())
Command-line usage
# Basic crawl with Markdown output
crwl https://www.nbcnews.com/business -o markdown
# Deep crawl with a BFS strategy, at most 10 pages
crwl https://docs.crawl4ai.com --deep-crawl bfs --max-pages 10
# Use an LLM to extract content according to a question
crwl https://www.example.com/products -q "Extract all product prices"
Core features
Content chunking strategy
result = await crawler.arun(
    url="https://example.com",
    chunking_strategy="SemanticChunking",  # semantic chunking
    chunking_options={
        "min_chunk_size": 100,
        "max_chunk_size": 1000
    }
)
CSS selector filtering
result = await crawler.arun(
    url="https://example.com",
    css_selector=".article-content, .main-text"  # extract only specific elements
)
JavaScript rendering
result = await crawler.arun(
    url="https://example.com",
    js_code="window.scrollTo(0, document.body.scrollHeight);",  # run JavaScript on the page
    wait_for=2000  # wait 2 seconds
)
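In recent releases these options are usually passed through CrawlerRunConfig rather than as arun keyword arguments, and wait_for takes a CSS or JS condition rather than a raw millisecond value. A minimal sketch, assuming that config-object interface; the ".loaded-content" selector is only a placeholder:
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main():
    run_config = CrawlerRunConfig(
        js_code=["window.scrollTo(0, document.body.scrollHeight);"],  # scripts to run after the page loads
        wait_for="css:.loaded-content",  # wait until this (placeholder) selector appears before extracting
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=run_config)
        print(result.markdown)

asyncio.run(main())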
Advanced configuration
Custom browser configuration
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

browser_config = BrowserConfig(
    headless=True,
    browser_type="chromium",
    viewport_width=1920,
    viewport_height=1080
)
run_config = CrawlerRunConfig(
    word_count_threshold=50,
    extraction_strategy="Auto"
)
async with AsyncWebCrawler(config=browser_config) as crawler:
    result = await crawler.arun(
        url="https://example.com",
        config=run_config
    )
Batch crawling
urls = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3"
]

async with AsyncWebCrawler() as crawler:
    tasks = [crawler.arun(url=url) for url in urls]
    results = await asyncio.gather(*tasks)
    for result in results:
        print(f"URL: {result.url}")
        print(f"Content length: {len(result.markdown)}")
Content handling
Accessing the extracted content
result = await crawler.arun(url="https://example.com")

# Raw HTML
html_content = result.html
# Markdown version
markdown_content = result.markdown
# Chunked content (if chunking is enabled)
chunks = result.chunks
# Metadata
metadata = result.metadata
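When the run config carries a markdown generator with a content filter (as in the Pruning/BM25 examples later in this post), result.markdown is an object with several fields rather than a plain string. A minimal sketch, assuming that newer-version behavior; run_config here stands for such a configuration:
result = await crawler.arun(url="https://example.com", config=run_config)

print(result.markdown.raw_markdown[:200])  # Markdown of the full page, before filtering
print(result.markdown.fit_markdown[:200])  # Markdown that survived the content filter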
Error handling
try:
    result = await crawler.arun(
        url="https://example.com",
        timeout=30000  # 30-second timeout
    )
    if result.success:
        print("Crawl succeeded")
    else:
        print(f"Error: {result.error_message}")
except Exception as e:
    print(f"Exception: {e}")
Practical example
News article extraction
async def extract_news_article(url):
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=url,
            css_selector=".article-body, .news-content, main",
            chunking_strategy="SemanticChunking",
            chunking_options={
                "min_chunk_size": 200,
                "max_chunk_size": 800
            }
        )
        return {
            "url": url,
            "title": result.metadata.get("title", ""),
            "content": result.markdown,
            "chunks": result.chunks
        }
Crawl4AI is particularly well suited to AI applications that need high-quality text extraction, such as RAG systems and content analysis. Its smart chunking and content cleaning can significantly improve downstream AI processing.
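As an illustration of that RAG use case, here is a minimal, hypothetical sketch that splits the crawled Markdown into paragraph chunks ready for embedding. The chunk size and splitting rule are arbitrary choices for the example, not part of crawl4ai:
import asyncio
from crawl4ai import AsyncWebCrawler

def split_markdown(text: str, max_chars: int = 800) -> list[str]:
    """Greedily pack paragraphs into chunks of at most max_chars characters."""
    chunks, current = [], ""
    for para in text.split("\n\n"):
        if current and len(current) + len(para) > max_chars:
            chunks.append(current.strip())
            current = ""
        current += para + "\n\n"
    if current.strip():
        chunks.append(current.strip())
    return chunks

async def build_rag_chunks(url: str) -> list[str]:
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=url)
        # str() covers both plain-string and markdown-object results
        return split_markdown(str(result.markdown))

print(asyncio.run(build_rag_chunks("https://example.com"))[:3])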
crawl4ai content filtering methods
crawl4ai offers three content-filtering (parsing) approaches: the pruning filter (PruningContentFilter), the BM25 filter (BM25ContentFilter), and the LLM filter (LLMContentFilter).
Because the LLM filter calls an external API, it costs money and responds slowly, so Pruning and BM25 are covered first (a hedged sketch of the LLM filter follows below).
In testing, BM25 performed poorly on Chinese content and works better for English, so Pruning is the recommended default.
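For completeness, a minimal sketch of the LLM filter. The constructor arguments below (llm_config, instruction, chunk_token_threshold) follow the recent crawl4ai docs as I understand them, and the provider/model name and API-key handling are just examples, so treat the details as assumptions:
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

async def main():
    llm_filter = LLMContentFilter(
        llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY"),  # example provider (assumption)
        instruction="Keep only the main article content; drop navigation, ads and footers.",
        chunk_token_threshold=4096,  # rough cap on how much text is sent to the LLM per call
    )
    run_config = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(content_filter=llm_filter)
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://www.anthropic.com/news/claude-4", config=run_config)
        print(result.markdown.fit_markdown)

asyncio.run(main())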
1. Pruning
Pruning automatically scores how important each block of content is and keeps only the important parts. Well suited to building AI knowledge bases.
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

"""
Pruning: keep only the important content; importance is scored automatically.
In Crawl4AI this is done by passing a PruningContentFilter to the DefaultMarkdownGenerator.
options:
    ignore_links: remove all hyperlinks from the final markdown
    ignore_images: remove all images from the final markdown
"""

async def main():
    browser_config = BrowserConfig(
        headless=True,  # headless mode; True: do not show a browser window
        viewport_width=1280,   # viewport width
        viewport_height=720,   # viewport height
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",  # user agent
        text_mode=True,  # text mode: disable image loading
    )  # browser configuration
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.DISABLED,  # disable the cache so the latest content is fetched
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                # min_word_threshold=10,  # drop blocks with fewer than 10 words; they are probably too short to be useful
                threshold=0.76,           # drop blocks scoring below 0.76; higher means stricter filtering
                threshold_type="fixed",   # threshold type; fixed: absolute value, dynamic: relative value
                # threshold_type="dynamic"
            ),
            options={
                "ignore_links": True,   # remove all hyperlinks from the final markdown
                "ignore_images": True,  # remove all images from the final markdown
            }
        )
    )  # crawler run configuration
    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            result = await crawler.arun(
                url="https://www.anthropic.com/news/agent-capabilities-api",
                config=run_config
            )
        except Exception as e:
            print(f"Error: {e}")
            return
        with open(f"2.2.2result-{len(result.markdown.fit_markdown)}.md", "w", encoding="utf-8") as f:
            f.write(result.markdown.fit_markdown)
            print(f"Content length: {len(result.markdown.fit_markdown)}")
            print(f"Saved to: {f.name}")

if __name__ == "__main__":
    asyncio.run(main())
2. BM25
BM25 scores the page's content blocks against a user query using keyword-based relevance ranking (the BM25 algorithm) and returns only the fragments that match the query. This suits "AI + web search" scenarios. In testing, BM25 worked poorly on Chinese and better on English content, so Pruning is still the recommended default.
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.content_filter_strategy import BM25ContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

"""
BM25: keyword-based relevance ranking; the page's content blocks are scored against
the user query and only the most relevant ones are kept.
"""

async def main():
    browser_config = BrowserConfig(
        headless=True,  # headless mode; True: do not show a browser window
        viewport_width=1280,   # viewport width
        viewport_height=720,   # viewport height
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",  # user agent
        text_mode=True,  # text mode: disable image loading
    )  # browser configuration
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.DISABLED,  # disable the cache so the latest content is fetched
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=BM25ContentFilter(
                user_query="Anthropic API",
                bm25_threshold=1.2,  # higher values keep fewer blocks
            ),
            options={
                "ignore_links": True,   # remove all hyperlinks from the final markdown
                "ignore_images": True,  # remove all images from the final markdown
            }
        )
    )  # crawler run configuration
    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            result = await crawler.arun(
                url="https://www.anthropic.com/news/agent-capabilities-api",
                config=run_config
            )
        except Exception as e:
            print(f"Error: {e}")
            return
        with open(f"2.2.3result-{len(result.markdown.fit_markdown)}.md", "w", encoding="utf-8") as f:
            f.write(result.markdown.fit_markdown)
            print(f"Content length: {len(result.markdown.fit_markdown)}")
            print(f"Saved to: {f.name}")

if __name__ == "__main__":
    asyncio.run(main())
3. Concurrent crawling of multiple URLs (arun_many)
import asyncio
import os
import re
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.content_filter_strategy import BM25ContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

"""
BM25: keyword-based relevance ranking; the page's content blocks are scored against
the user query and only the most relevant ones are kept.
"""

async def main():
    browser_config = BrowserConfig(
        headless=True,  # headless mode; True: do not show a browser window
        viewport_width=1280,   # viewport width
        viewport_height=720,   # viewport height
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",  # user agent
        text_mode=True,  # text mode: disable image loading
    )  # browser configuration
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.DISABLED,  # disable the cache so the latest content is fetched
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=BM25ContentFilter(
                user_query="Anthropic API",
                bm25_threshold=1.2,
            ),
            options={
                "ignore_links": True,   # remove all hyperlinks from the final markdown
                "ignore_images": True,  # remove all images from the final markdown
            }
        )
    )  # crawler run configuration
    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            results = await crawler.arun_many(
                urls=[
                    "https://www.anthropic.com/news/agent-capabilities-api",
                    "https://www.anthropic.com/news/claude-4",
                    "https://www.anthropic.com/news/the-anthropic-economic-index"
                ],
                config=run_config
            )
        except Exception as e:
            print(f"Error: {e}")
            return
        # Create the output directory if it does not exist
        output_dir = "anthropic_articles"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        for result in results:
            # Derive a file name from the URL
            filename = re.sub(r'https?://(?:www\.)?anthropic\.com/news/', '', result.url)
            filename = re.sub(r'[^\w\-]', '_', filename)  # replace non-word characters with underscores
            filepath = os.path.join(output_dir, f"{filename}-{len(result.markdown.fit_markdown)}.md")
            # Save to a markdown file
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(result.markdown.fit_markdown)
            print(f"Saved: {filepath}")

if __name__ == "__main__":
    asyncio.run(main())