从0开始的爬虫实践项目 (4):用百度搜索,然后检索搜索结果网址中的信息

阅读约 2 分钟

诸神缄默不语-个人CSDN博文目录

以下Python 3代码中的parse_baidu_results()函数的返回值是一个字典,键是网址,值是(网页标题,网页内容)。
user_agent变量请直接打开浏览器,在百度中随便搜点什么,打开开发者工具中的Network,刷新,在出现的变量中选择按Size逆序排序,选择第一个变量的User Agent值复制进去就可以。

import sys
import time
import requests
import os
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

TOTAL_TIMEOUT = 6
NUM_SEARCH = 20
SEARCH_TIME_LIMIT = 3


def trace_function_factory(start):
    """Build a watchdog trace callback for timing out long requests.

    The returned function is meant to be installed with ``sys.settrace``;
    it raises ``TimeoutError`` once more than ``TOTAL_TIMEOUT`` seconds
    have elapsed since *start*, and otherwise keeps tracing by returning
    itself.
    """

    def _watchdog(frame, event, arg):
        elapsed = time.time() - start
        if elapsed > TOTAL_TIMEOUT:
            raise TimeoutError("网站获取超时")
        return _watchdog

    return _watchdog


def fetch_webpage(url, timeout):
    """Download *url* and extract its title plus concatenated <p> text.

    Returns ``(url, title, page_text)`` on success. On any request
    error or watchdog timeout, logs to stderr and returns
    ``(url, None, None)`` so callers can filter out failures.
    """
    # Install the watchdog so a hung fetch cannot exceed TOTAL_TIMEOUT.
    sys.settrace(trace_function_factory(time.time()))
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        response.encoding = "utf-8"  # force UTF-8 decoding of the body
        soup = BeautifulSoup(response.text, "lxml")
        page_title = soup.title.string if soup.title else "无标题"
        page_text = " ".join(p.get_text() for p in soup.find_all("p"))
        return url, page_title, page_text
    except (requests.exceptions.RequestException, TimeoutError) as e:
        print(f"获取 {url} 时出错: {e}", file=sys.stderr)
        return url, None, None
    finally:
        # Always remove the trace hook, success or failure.
        sys.settrace(None)


def baidu_search(query, num_results=10):
    """执行百度搜索并返回结果 URL 列表。

    Performs a Baidu web search for *query* and returns up to
    *num_results* result URLs; returns ``[]`` on any request failure.

    Relies on a module-level ``user_agent`` string being defined
    (copy a real browser User-Agent, otherwise Baidu may block or
    redirect the request).
    """
    print(f"执行百度搜索: {query}")  # debug output
    headers = {
        "User-Agent": user_agent
    }
    # Pass the query via `params` so requests percent-encodes special
    # characters (&, #, spaces, ...) instead of corrupting the URL.
    params = {"wd": query, "rn": num_results}
    try:
        response = requests.get(
            "https://www.baidu.com/s", params=params, headers=headers, timeout=10
        )
        response.raise_for_status()
        print(f"百度搜索状态码: {response.status_code}")  # debug: HTTP status
        soup = BeautifulSoup(response.text, "html.parser")
        search_results = []
        # NOTE(review): these CSS selectors match Baidu's current result
        # markup and may break if the page layout changes.
        for result in soup.select(".result.c-container"):
            link = result.select_one("h3.t a")
            if link and "href" in link.attrs:
                search_results.append(link["href"])
        print(f"百度搜索结果数量: {len(search_results)}")  # debug output
        if len(search_results) == 0:
            print("警告:没有找到搜索结果。HTML内容:")
            print(response.text[:1000])  # dump first 1000 chars of HTML for diagnosis
        return search_results[:num_results]
    except requests.RequestException as e:
        print(f"百度搜索请求失败: {e}")
        return []


def parse_baidu_results(
    query, num_search=NUM_SEARCH, search_time_limit=SEARCH_TIME_LIMIT
):
    """执行百度搜索并解析顶部结果的内容。

    Searches Baidu for *query*, fetches each result page concurrently,
    and returns a dict mapping url -> (title, page_text) for every page
    fetched successfully. Failed fetches (fetch_webpage returns
    ``(url, None, None)``) are skipped.
    """
    print(f"开始搜索: {query}")  # debug output
    urls = baidu_search(query, num_search)
    print(f"搜索到的URL数量: {len(urls)}")  # debug output
    results = {}
    max_workers = os.cpu_count() or 1
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(fetch_webpage, url, search_time_limit) for url in urls
        ]
        for future in as_completed(futures):
            # Unpack the result once (the original called future.result()
            # three times per future) and keep any page that actually
            # fetched — including pages whose <p> text is empty, which the
            # old truthiness filter silently discarded.
            url, title, page_text = future.result()
            if title is not None and page_text is not None:
                results[url] = (title, page_text)
    print(f"成功获取的网页数量: {len(results)}")  # debug output
    return results


if __name__ == "__main__":
    # Demo run: only fire a live search when executed as a script,
    # not when this module is imported.
    print(parse_baidu_results("Python"))

参考资料:飞桨AI Studio星河社区-人工智能学习与实训社区