以下 Python 3 代码中的 parse_baidu_results() 函数返回一个字典：键是网址，值是 (网页标题, 网页内容) 元组。
user_agent 变量的获取方法：打开浏览器，在百度中随便搜索一些内容；打开开发者工具的 Network 面板并刷新页面；将请求列表按 Size 逆序排序，选中第一个请求，把其中的 User-Agent 值复制进去即可。
import sys
import time
import requests
import os
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
TOTAL_TIMEOUT = 6        # hard wall-clock budget (seconds) for one page fetch+parse
NUM_SEARCH = 20          # default number of Baidu results to request
SEARCH_TIME_LIMIT = 3    # per-request HTTP timeout (seconds) passed to requests.get


def trace_function_factory(start):
    """Build a per-thread ``sys.settrace`` hook that enforces TOTAL_TIMEOUT.

    The returned callable is installed with ``sys.settrace``; on every traced
    event it compares the elapsed wall-clock time since *start* against
    ``TOTAL_TIMEOUT`` and raises ``TimeoutError`` once the budget is spent,
    aborting whatever Python code is currently running in that thread.
    """

    def _watchdog(frame, event, arg):
        elapsed = time.time() - start
        if elapsed > TOTAL_TIMEOUT:
            raise TimeoutError("网站获取超时")
        # Returning the hook itself keeps tracing active for nested frames.
        return _watchdog

    return _watchdog
def fetch_webpage(url, timeout):
    """Fetch *url* and extract its title and paragraph text.

    Args:
        url: Page URL to download.
        timeout: Per-request timeout (seconds) passed to ``requests.get``.

    Returns:
        ``(url, title, page_text)`` on success; ``(url, None, None)`` when the
        request fails or the TOTAL_TIMEOUT watchdog fires.
    """
    start = time.time()
    # Watchdog: aborts this thread's Python execution (including the
    # BeautifulSoup parse below) once TOTAL_TIMEOUT elapses — the requests
    # `timeout` alone does not bound parsing time.
    sys.settrace(trace_function_factory(start))
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        response.encoding = "utf-8"  # force UTF-8; NOTE(review): assumes pages are UTF-8
        # Use the stdlib parser for consistency with baidu_search() and to
        # avoid a hard dependency on the third-party lxml package (the
        # original passed "lxml", which raises FeatureNotFound when lxml
        # is not installed).
        soup = BeautifulSoup(response.text, "html.parser")
        # soup.title.string is None when <title> contains child tags, so
        # guard both the tag and its string before using it.
        title = soup.title.string if soup.title and soup.title.string else "无标题"
        paragraphs = soup.find_all("p")
        page_text = " ".join(para.get_text() for para in paragraphs)
        return url, title, page_text
    except (requests.exceptions.RequestException, TimeoutError) as e:
        print(f"获取 {url} 时出错: {e}", file=sys.stderr)
    finally:
        sys.settrace(None)  # always uninstall the watchdog for this thread
    return url, None, None
def baidu_search(query, num_results=10):
    """Run a Baidu web search and return the result links.

    Args:
        query: Search query string (any text; it is URL-encoded by requests).
        num_results: Maximum number of result URLs to return.

    Returns:
        A list of up to *num_results* result URLs, or ``[]`` on request failure.
    """
    print(f"执行百度搜索: {query}")  # 添加调试信息
    # Fall back to a generic desktop UA when the module-level ``user_agent``
    # variable (which the header notes ask the user to paste in) was never
    # defined — the original referenced it unconditionally and raised
    # NameError in that case.
    fallback_ua = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
    headers = {
        "User-Agent": globals().get("user_agent", fallback_ua)
    }
    url = "https://www.baidu.com/s"
    try:
        # ``params`` lets requests percent-encode the query (spaces, CJK,
        # '&', …) — the original spliced raw text into the URL.
        response = requests.get(
            url,
            headers=headers,
            params={"wd": query, "rn": num_results},
            timeout=10,
        )
        response.raise_for_status()
        print(f"百度搜索状态码: {response.status_code}")  # 添加状态码调试信息
        soup = BeautifulSoup(response.text, "html.parser")
        search_results = []
        for result in soup.select(".result.c-container"):
            link = result.select_one("h3.t a")
            if link and "href" in link.attrs:
                search_results.append(link["href"])
        print(f"百度搜索结果数量: {len(search_results)}")  # 添加调试信息
        if not search_results:
            print("警告:没有找到搜索结果。HTML内容:")
            print(response.text[:1000])  # 打印前1000个字符的HTML内容
        return search_results[:num_results]
    except requests.RequestException as e:
        print(f"百度搜索请求失败: {e}")
        return []
def parse_baidu_results(
    query, num_search=NUM_SEARCH, search_time_limit=SEARCH_TIME_LIMIT
):
    """执行百度搜索并解析顶部结果的内容。

    Args:
        query: Search query string.
        num_search: Number of Baidu results to request.
        search_time_limit: Per-page HTTP timeout (seconds) for fetching.

    Returns:
        dict mapping url -> (title, page_text); pages whose fetch failed or
        produced an empty title/body are dropped.
    """
    print(f"开始搜索: {query}")  # 添加调试信息
    urls = baidu_search(query, num_search)
    print(f"搜索到的URL数量: {len(urls)}")  # 添加调试信息
    max_workers = os.cpu_count() or 1
    results = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {
            executor.submit(fetch_webpage, url, search_time_limit): url
            for url in urls
        }
        for future in as_completed(future_to_url):
            # Unpack the result tuple once — the original dict comprehension
            # called future.result() three separate times per future.
            url, title, page_text = future.result()
            # Keep only fully successful fetches (all fields truthy), matching
            # the original walrus-chain filter.
            if url and title and page_text:
                results[url] = (title, page_text)
    print(f"成功获取的网页数量: {len(results)}")  # 添加调试信息
    return results
print(parse_baidu_results("Python"))