[1339] Python crawl4ai basic usage



GitHub: github.com/unclecode/c…

Crawl4AI is a powerful web-crawling and content-extraction library built specifically for AI applications. Its main usage patterns and core features are covered below.

Installation

# Install the package
pip install -U crawl4ai

# For pre release versions
pip install crawl4ai --pre

# Run post-installation setup
crawl4ai-setup

# Verify your installation
crawl4ai-doctor

Basic Usage

1. Simple crawl

from crawl4ai import AsyncWebCrawler

# Inside an async function (or an already-running event loop):
async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(url="https://example.com")
    print(result.html)      # full raw HTML
    print(result.markdown)  # content converted to Markdown

2. Complete async script (recommended)

import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        print(result.markdown)

asyncio.run(main())

Command-Line Usage

# Basic crawl with Markdown output
crwl https://www.nbcnews.com/business -o markdown

# Deep crawl with a BFS strategy, at most 10 pages
crwl https://docs.crawl4ai.com --deep-crawl bfs --max-pages 10

# Use an LLM to extract content based on a question
crwl https://www.example.com/products -q "Extract all product prices"

Core Features

Content chunking strategies

from crawl4ai import CrawlerRunConfig
from crawl4ai.chunking_strategy import RegexChunking

result = await crawler.arun(
    url="https://example.com",
    config=CrawlerRunConfig(
        # Chunking strategies are passed as objects, e.g. RegexChunking;
        # they mainly affect extraction strategies that work on chunks.
        chunking_strategy=RegexChunking(patterns=[r"\n\n"])  # split on blank lines
    )
)

CSS selector filtering

result = await crawler.arun(
    url="https://example.com",
    config=CrawlerRunConfig(
        css_selector=".article-content, .main-text"  # extract only matching elements
    )
)

JavaScript rendering

result = await crawler.arun(
    url="https://example.com",
    config=CrawlerRunConfig(
        js_code="window.scrollTo(0, document.body.scrollHeight);",  # run JS on the page
        delay_before_return_html=2.0  # wait 2 seconds before capturing the HTML
    )
)

Advanced Configuration

Custom browser configuration

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

browser_config = BrowserConfig(
    headless=True,
    browser_type="chromium",
    viewport_width=1920,
    viewport_height=1080
)

run_config = CrawlerRunConfig(
    word_count_threshold=50  # ignore text blocks with fewer than 50 words
)

async with AsyncWebCrawler(config=browser_config) as crawler:
    result = await crawler.arun(
        url="https://example.com",
        config=run_config
    )

Batch crawling

import asyncio

urls = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3"
]

async with AsyncWebCrawler() as crawler:
    tasks = [crawler.arun(url=url) for url in urls]
    results = await asyncio.gather(*tasks)

    for result in results:
        print(f"URL: {result.url}")
        print(f"Content length: {len(result.markdown)}")

Content Processing

Accessing the extracted content

result = await crawler.arun(url="https://example.com")

# Raw HTML
html_content = result.html

# Markdown output
markdown_content = result.markdown

# Structured data (only if an extraction strategy was configured)
extracted = result.extracted_content

# Page metadata (title, description, etc.)
metadata = result.metadata
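
When a markdown_generator with a content_filter is configured (as in the Pruning and BM25 examples later in this post), result.markdown also carries a filtered variant. A minimal sketch of reading both forms, assuming such a filter has been set:

# raw_markdown: the whole page converted to Markdown
print(len(result.markdown.raw_markdown))
# fit_markdown: only the blocks kept by the content filter
print(len(result.markdown.fit_markdown))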

Error handling

try:
    result = await crawler.arun(
        url="https://example.com",
        config=CrawlerRunConfig(page_timeout=30000)  # 30-second page timeout (milliseconds)
    )
    if result.success:
        print("Crawl succeeded")
    else:
        print(f"Error: {result.error_message}")
except Exception as e:
    print(f"Exception: {e}")

Practical Example

News article extraction

async def extract_news_article(url):
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=url,
            config=CrawlerRunConfig(
                css_selector=".article-body, .news-content, main"  # keep only the article body
            )
        )
        return {
            "url": url,
            "title": result.metadata.get("title", ""),
            "content": result.markdown
        }

Crawl4AI is particularly well suited to AI applications that need high-quality text extraction, such as RAG systems and content analysis; its filtering and content-cleaning features can significantly improve downstream AI processing, as sketched below.
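
As a rough illustration of the RAG use case, the sketch below splits the crawled Markdown into overlapping chunks with plain Python; split_markdown is a hypothetical helper, not part of crawl4ai:

def split_markdown(text: str, max_chars: int = 800, overlap: int = 100):
    """Split Markdown text into overlapping character-based chunks (hypothetical helper)."""
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + max_chars, len(text))
        chunks.append(text[start:end])
        if end == len(text):
            break
        start = end - overlap  # keep some overlap between neighbouring chunks
    return chunks

# e.g. feed the crawled Markdown into a RAG indexing pipeline
chunks = split_markdown(str(result.markdown))
print(f"{len(chunks)} chunks ready for embedding")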


crawl4ai parsing methods

crawl4ai offers three content-filtering (parsing) approaches: the pruning filter (PruningContentFilter), the BM25 filter (BM25ContentFilter), and the LLM filter (LLMContentFilter).

Because the LLM filter has to call an LLM API, which costs money and responds slowly, this post focuses on Pruning and BM25 first.
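
For completeness, a rough sketch of the LLM filter follows the LLMContentFilter pattern from the crawl4ai documentation; the exact parameter names (llm_config, instruction, chunk_token_threshold) vary between versions, and the provider string and API key below are placeholders, so check them against your installed release:

from crawl4ai import CrawlerRunConfig, LLMConfig
from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

llm_filter = LLMContentFilter(
    llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token="YOUR_API_KEY"),  # placeholder credentials
    instruction="Extract the main article content and drop navigation, ads and footers.",
    chunk_token_threshold=4096,  # roughly how much text is sent to the LLM per call
)

run_config = CrawlerRunConfig(
    markdown_generator=DefaultMarkdownGenerator(content_filter=llm_filter)
)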

In testing, BM25 does not work well on Chinese text and matches English content better, so Pruning is the recommended choice.

1. Pruning

Pruning automatically scores each content block's importance and keeps only the important blocks, which makes it a good fit for building AI knowledge bases.

import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

"""
Pruning: automatically scores each block's importance and keeps only the important content.
In Crawl4AI, pruning is applied through the content_filter of DefaultMarkdownGenerator.
options:
    ignore_links: remove all hyperlinks from the final markdown
    ignore_images: remove all images from the final markdown
"""

async def main():
    browser_config = BrowserConfig(
        headless=True,  # headless mode; True: do not open a visible browser window
        viewport_width=1280,  # viewport width
        viewport_height=720,  # viewport height
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",  # user agent
        text_mode=True,  # text mode, disables image loading
    )  # browser configuration
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.DISABLED,  # disable caching so the latest content is always fetched
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                # min_word_threshold=10,  # drop blocks with fewer than 10 words; they are likely too short to be useful
                threshold=0.76,  # drop blocks scored below 0.76; higher means stricter filtering
                threshold_type="fixed",  # threshold type; fixed: absolute value, dynamic: relative value
                # threshold_type="dynamic"
            ),
            options={
                "ignore_links": True,  # remove all hyperlinks from the final markdown
                "ignore_images": True,  # remove all images from the final markdown
            }
        )
    )  # crawler run configuration

    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            result = await crawler.arun(
                url="https://www.anthropic.com/news/agent-capabilities-api",
                config=run_config
            )
        except Exception as e:
            print(f"Error: {e}")
            return  # without this, the code below would reference an undefined result
        with open(f"2.2.2result-{len(result.markdown.fit_markdown)}.md", "w", encoding="utf-8") as f:
            f.write(result.markdown.fit_markdown)
        print(f"Content length: {len(result.markdown.fit_markdown)}")
        print(f"Saved to: {f.name}")

if __name__ == "__main__":
    asyncio.run(main())

2. BM25

BM25 is a keyword-based relevance-ranking algorithm: it scores each content block against a user query and returns the passages most relevant to that query, which makes it a good fit for AI + web-search scenarios. In testing, BM25 does not work well on Chinese and matches English content better, so Pruning is generally recommended.

import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.content_filter_strategy import BM25ContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

"""
BM25: keyword-based relevance ranking; each content block is scored against the user query
and only the most relevant blocks are kept.
"""

async def main():
    browser_config = BrowserConfig(
        headless=True,  # headless mode; True: do not open a visible browser window
        viewport_width=1280,  # viewport width
        viewport_height=720,  # viewport height
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",  # user agent
        text_mode=True,  # text mode, disables image loading
    )  # browser configuration
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.DISABLED,  # disable caching so the latest content is always fetched
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=BM25ContentFilter(
                user_query="Anthropic API",
                bm25_threshold=1.2,  # higher values keep fewer blocks
            ),
            options={
                "ignore_links": True,  # remove all hyperlinks from the final markdown
                "ignore_images": True,  # remove all images from the final markdown
            }
        )
    )  # crawler run configuration

    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            result = await crawler.arun(
                url="https://www.anthropic.com/news/agent-capabilities-api",
                config=run_config
            )
        except Exception as e:
            print(f"Error: {e}")
            return  # without this, the code below would reference an undefined result
        with open(f"2.2.3result-{len(result.markdown.fit_markdown)}.md", "w", encoding="utf-8") as f:
            f.write(result.markdown.fit_markdown)
        print(f"Content length: {len(result.markdown.fit_markdown)}")
        print(f"Saved to: {f.name}")

if __name__ == "__main__":
    asyncio.run(main())

3. Concurrent crawling of multiple URLs (arun_many)

import asyncio
import os
import re
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.content_filter_strategy import BM25ContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

"""
BM25: keyword-based relevance ranking; each content block is scored against the user query
and only the most relevant blocks are kept.
"""

async def main():
    browser_config = BrowserConfig(
        headless=True,  # headless mode; True: do not open a visible browser window
        viewport_width=1280,  # viewport width
        viewport_height=720,  # viewport height
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",  # user agent
        text_mode=True,  # text mode, disables image loading
    )  # browser configuration
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.DISABLED,  # disable caching so the latest content is always fetched
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=BM25ContentFilter(
                user_query="Anthropic API",
                bm25_threshold=1.2,
            ),
            options={
                "ignore_links": True,  # remove all hyperlinks from the final markdown
                "ignore_images": True,  # remove all images from the final markdown
            }
        )
    )  # crawler run configuration

    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            results = await crawler.arun_many(
                urls=[
                    "https://www.anthropic.com/news/agent-capabilities-api",
                    "https://www.anthropic.com/news/claude-4",
                    "https://www.anthropic.com/news/the-anthropic-economic-index"
                ],
                config=run_config
            )
        except Exception as e:
            print(f"Error: {e}")
            return

        # Create the output directory if it does not exist
        output_dir = "anthropic_articles"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for result in results:
            # Skip pages that failed to crawl
            if not result.success:
                print(f"Failed: {result.url} - {result.error_message}")
                continue

            # Build a file name from the URL
            filename = re.sub(r'https?://(?:www\.)?anthropic\.com/news/', '', result.url)
            filename = re.sub(r'[^\w\-]', '_', filename)  # replace non-word characters with underscores
            filepath = os.path.join(output_dir, f"{filename}-{len(result.markdown.fit_markdown)}.md")

            # Save to a markdown file
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(result.markdown.fit_markdown)

            print(f"Saved: {filepath}")


if __name__ == "__main__":
    asyncio.run(main())