Below is the integrated code file covering all platforms, followed by the required dependencies and configuration notes:
```python
import requests
from bs4 import BeautifulSoup
import json
import re
import time
import random
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from urllib.parse import quote

# ================== Global configuration ==================
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.9'
}
# ================== Platform fetchers ==================
def fetch_weibo_hot():
    """Weibo hot search"""
    try:
        url = 'https://weibo.com/ajax/side/hotSearch'
        response = requests.get(url, headers=HEADERS, timeout=10)
        data = response.json()['data']['realtime']
        # quote() keeps the search link valid for keywords with special characters
        return [{'title': item['word'],
                 'url': f"https://s.weibo.com/weibo?q={quote(item['word'])}"} for item in data]
    except Exception as e:
        print(f"Failed to fetch Weibo hot search: {e}")
        return []
def fetch_zhihu_hot():
    """Zhihu hot list"""
    try:
        url = 'https://www.zhihu.com/billboard'
        response = requests.get(url, headers=HEADERS, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        # The hot list is embedded as JSON inside a <script> tag
        script_data = soup.find('script', id='js-initialData').string
        json_data = json.loads(script_data)
        hot_list = json_data['initialState']['topstory']['hotList']
        return [{
            'title': item['target']['titleArea']['text'],
            'url': item['target']['link']['url']
        } for item in hot_list]
    except Exception as e:
        print(f"Failed to fetch Zhihu hot list: {e}")
        return []
def fetch_baidu_hot():
    """Baidu hot search"""
    try:
        url = 'https://top.baidu.com/board?tab=realtime'
        response = requests.get(url, headers=HEADERS, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        items = soup.find_all('div', class_='category-wrap_iQLoo')
        return [{
            'title': item.find('div', class_='c-single-text-ellipsis').text.strip(),
            'url': item.find('a')['href']
        } for item in items]
    except Exception as e:
        print(f"Failed to fetch Baidu hot search: {e}")
        return []
def fetch_weixin_hot():
    """WeChat hot topics (via Sogou)"""
    try:
        url = 'https://weixin.sogou.com/'
        response = requests.get(url, headers=HEADERS, timeout=15)
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = []
        for item in soup.select('#hotBox li'):
            # Only keep links pointing at WeChat article pages
            title_tag = item.find('a', href=re.compile(r'^http://mp\.weixin\.qq\.com'))
            if title_tag:
                articles.append({
                    'title': title_tag.text.strip(),
                    'url': title_tag['href']
                })
        return articles[:10]
    except Exception as e:
        print(f"Failed to fetch WeChat hot topics: {e}")
        return []
def fetch_juejin_hot():
    """Juejin hot list"""
    try:
        url = 'https://api.juejin.cn/content_api/v1/content/article_rank?category_id=1&type=hot'
        response = requests.post(url, headers=HEADERS, timeout=10)
        return [{
            'title': item['content']['title'],
            'url': f"https://juejin.cn/post/{item['content']['content_id']}"
        } for item in response.json()['data'][:20]]
    except Exception as e:
        print(f"Failed to fetch Juejin hot list: {e}")
        return []
def fetch_csdn_hot():
    """CSDN hot rank"""
    try:
        url = 'https://blog.csdn.net/phoenix/web/blog/hot-rank?page=0&pageSize=25'
        headers = HEADERS.copy()
        headers['Referer'] = 'https://blog.csdn.net/'  # the endpoint expects a Referer header
        response = requests.get(url, headers=headers, timeout=10)
        return [{
            'title': item['articleTitle'],
            'url': item['articleDetailUrl'],
            'heat': item['hotRankScore']
        } for item in response.json()['data'][:15]]
    except Exception as e:
        print(f"Failed to fetch CSDN hot rank: {e}")
        return []
def fetch_toutiao_hot():
    """Toutiao hot list (Selenium-based)"""
    driver = None
    try:
        options = Options()
        options.add_argument("--headless")
        options.add_argument(f"user-agent={HEADERS['User-Agent']}")
        driver = webdriver.Chrome(options=options)
        driver.get("https://www.toutiao.com/")
        time.sleep(5)  # wait for the page to render
        hot_items = driver.find_elements(By.XPATH, '//div[contains(@class,"hot-list-item")]')
        results = []
        for item in hot_items[:20]:
            results.append({
                'title': item.find_element(By.CLASS_NAME, 'title').text,
                'url': item.find_element(By.TAG_NAME, 'a').get_attribute('href')
            })
        return results
    except Exception as e:
        print(f"Failed to fetch Toutiao hot list: {e}")
        return []
    finally:
        if driver is not None:
            driver.quit()  # always release the browser, even on failure
# ================== Core logic ==================
def save_to_json(data, filename='hot_data.json'):
    """Save the collected data to a JSON file"""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
def main():
    platforms = {
        'Weibo Hot Search': fetch_weibo_hot,
        'Zhihu Hot List': fetch_zhihu_hot,
        'Baidu Hot Search': fetch_baidu_hot,
        'WeChat Hot Topics': fetch_weixin_hot,
        'Juejin Hot List': fetch_juejin_hot,
        'CSDN Hot Rank': fetch_csdn_hot,
        'Toutiao': fetch_toutiao_hot
    }
    all_data = {}
    for name, func in platforms.items():
        print(f'\n====== Fetching {name} ======')
        start_time = time.time()
        all_data[name] = func()
        cost = time.time() - start_time
        print(f"Done! Took {cost:.2f}s, fetched {len(all_data[name])} items")
        time.sleep(random.randint(2, 5))  # random delay between platforms

    # Save the data
    save_to_json(all_data)
    print('\n===== Data saved to hot_data.json =====')

    # Print a summary
    print('\n===== Latest hot list summary =====')
    for platform, items in all_data.items():
        print(f'\n▶ {platform} ({len(items)} items):')
        for idx, item in enumerate(items[:3], 1):
            print(f"  {idx}. {item['title'][:30]}...")
            if 'heat' in item:
                print(f"     Heat: {item['heat']}  Link: {item['url'][:50]}...")
            else:
                print(f"     Link: {item['url'][:50]}...")

if __name__ == '__main__':
    main()
```
Usage

- Environment setup:

  ```bash
  # Install the dependencies
  pip install requests beautifulsoup4 selenium webdriver-manager
  ```

- Browser driver configuration (or let webdriver-manager handle it; see the sketch after this list):
  - Install the Chrome browser
  - Download the chromedriver build matching your Chrome version
  - Put the driver file on the system PATH or in the same directory as the script
- Run the program:

  ```bash
  python hotlist_crawler.py
  ```
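If you would rather not download chromedriver by hand, webdriver-manager (already included in the install command above) can fetch a matching driver at runtime. A minimal sketch, assuming Selenium 4:

```python
# Minimal sketch: webdriver-manager downloads a chromedriver matching the
# installed Chrome, so no manual driver setup is needed (Selenium 4 API).
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
```

`fetch_toutiao_hot()` could construct its driver the same way.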
Extending

Adding a new platform:

```python
def fetch_new_platform_hot():
    # Implement the fetching logic here
    return [{'title': ..., 'url': ...}]

# Then register it in the platforms dict
platforms = {
    ...
    'New platform name': fetch_new_platform_hot
}
```
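As a concrete illustration, a fetcher for a hypothetical JSON endpoint might look like the following; the URL and field names are placeholders, and returning `[]` on failure keeps `main()` and the summary printer working:

```python
def fetch_example_hot():
    """Hypothetical fetcher; the endpoint and field names are placeholders."""
    try:
        response = requests.get('https://example.com/api/hot', headers=HEADERS, timeout=10)
        # The contract: a list of dicts with at least 'title' and 'url'
        return [{'title': item['title'], 'url': item['url']}
                for item in response.json()['data'][:20]]
    except Exception as e:
        print(f"Failed to fetch example hot list: {e}")
        return []
```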
Sample output

Console output:

```
====== Fetching Weibo Hot Search ======
Done! Took 1.23s, fetched 50 items

====== Fetching Zhihu Hot List ======
Done! Took 2.15s, fetched 50 items
...

===== Data saved to hot_data.json =====

===== Latest hot list summary =====

▶ Weibo Hot Search (50 items):
  1. #神舟十八号发射成功#...
     Link: https://s.weibo.com/weibo?q=%23%E7%A5%9E%E...
  2. #五一假期出行预测#...
     Link: https://s.weibo.com/weibo?q=%23%E4%BA%94%...

▶ Toutiao (20 items):
  1. 国际油价创年内新高...
     Link: https://www.toutiao.com/trending/723456...
```
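The saved hot_data.json groups results by platform name; an abridged illustration of its shape (values shortened, CSDN entries additionally carry a `heat` field):

```json
{
  "Weibo Hot Search": [
    {"title": "#神舟十八号发射成功#", "url": "https://s.weibo.com/weibo?q=%23..."}
  ],
  "CSDN Hot Rank": [
    {"title": "...", "url": "https://blog.csdn.net/...", "heat": "..."}
  ]
}
```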
Notes

- Anti-crawling measures (a combined sketch follows this list):
  - Add a valid Cookie to `HEADERS`
  - Use a proxy IP pool (a paid proxy service is recommended)

  ```python
  # Add a proxy to a request
  response = requests.get(url, proxies={'http': 'http://ip:port'})
  ```
- Performance optimization:

  ```python
  # Fetch platforms concurrently (example)
  from concurrent.futures import ThreadPoolExecutor, as_completed

  with ThreadPoolExecutor(max_workers=3) as executor:
      futures = {executor.submit(func): name for name, func in platforms.items()}
      for future in as_completed(futures):
          name = futures[future]
          all_data[name] = future.result()
  ```
- Scheduled runs:

  ```python
  # Run on a schedule with APScheduler (pip install apscheduler)
  from apscheduler.schedulers.blocking import BlockingScheduler

  scheduler = BlockingScheduler()

  @scheduler.scheduled_job('interval', hours=1)
  def scheduled_job():
      main()

  scheduler.start()
  ```
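As referenced above, a minimal sketch combining both anti-crawling ideas; the cookie string and proxy addresses are placeholders you would supply yourself:

```python
# Placeholders only: a real cookie comes from a logged-in browser session,
# and a paid proxy service would supply working addresses.
import random
import requests

HEADERS['Cookie'] = 'your_cookie_string_here'
PROXY_POOL = ['http://1.2.3.4:8080', 'http://5.6.7.8:3128']

def get_with_proxy(url):
    proxy = random.choice(PROXY_POOL)  # rotate a proxy per request
    return requests.get(url, headers=HEADERS,
                        proxies={'http': proxy, 'https': proxy}, timeout=10)
```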
Adjust the request frequency and anti-crawling measures to your actual needs. The code above covers hot-list fetching for the major platforms and can be run as-is or used as a base for further development.