技术架构

```mermaid
graph TD
    A[爬虫调度中心] --> B(请求频率控制)
    B --> C{页面类型}
    C -->|列表页| D[HTML解析器]
    C -->|详情页| E[动态渲染引擎]
    D --> F[数据清洗模块]
    E --> F
    F --> G[结构化存储]
```
点击获取key和secret
核心代码实现(Python)
python
Copy Code
import requests
from bs4 import BeautifulSoup
import time
import json
from urllib.parse import urljoin
class XJBCrawler:
    """Crawler for news list pages of the Beijing News site (www.bjnews.com.cn).

    All requests go through one shared ``requests.Session`` and carry the
    headers built in ``__init__``.
    """

    # ``requests`` refuses URLs without a scheme, and ``urljoin`` can only build
    # absolute article links when the base carries one.
    BASE_URL = "https://www.bjnews.com.cn"

    # A list page holding at least this many items is taken as a sign that a
    # further page exists.
    PAGE_SIZE = 20

    def __init__(self):
        """Set up the base URL, the default request headers and the session."""
        self.base_url = self.BASE_URL
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ResearchBot/1.0',
            'Referer': self.base_url
        }
        self.session = requests.Session()

    def fetch_news_list(self, channel='china', page=1):
        """Fetch one page of a channel's news list.

        :param channel: news channel path segment (china/society/economy...)
        :param page: 1-based page number; page 1 is requested without a
            ``page`` query parameter
        :return: a JSON-serialisable dict. On success it holds ``status`` 200,
            ``data`` (list of item dicts) and ``page_info``; on a non-200
            response it holds the HTTP status and an ``error`` message; on any
            exception it holds ``status`` 500 and the exception text. This
            method never raises.
        """
        try:
            url = f"{self.base_url}/{channel}/"
            params = {'page': page} if page > 1 else {}
            # A timeout keeps a stalled connection from blocking the crawler.
            response = self.session.get(url, headers=self.headers, params=params, timeout=10)
            response.encoding = 'utf-8'
            if response.status_code != 200:
                return {"status": response.status_code, "error": "请求失败"}

            soup = BeautifulSoup(response.text, 'lxml')
            news_items = []
            for article in soup.select('div.article-list > div.list-item'):
                news_item = self._parse_article(article, channel)
                if news_item is not None:
                    news_items.append(news_item)

            return {
                "status": 200,
                "data": news_items,
                "page_info": {
                    "current_page": page,
                    "next_page": page + 1 if len(news_items) >= self.PAGE_SIZE else None
                }
            }
        except Exception as e:
            # Boundary of the crawler API: callers expect an error dict rather
            # than an exception, so every failure is reported as status 500.
            return {"status": 500, "error": str(e)}

    def _parse_article(self, article, channel):
        """Convert one list-item element into a news dict.

        Returns ``None`` when the title link, its ``href`` or the publish time
        is missing. A missing summary is tolerated and yields an empty
        ``abstract`` so that one incomplete item cannot fail the whole page.
        """
        title_elem = article.select_one('h3 > a')
        time_elem = article.select_one('div.info > span.date')
        if title_elem is None or time_elem is None:
            return None

        href = title_elem.get('href')
        if not href:
            return None

        summary_elem = article.select_one('div.summary')
        abstract = summary_elem.text.strip()[:100] if summary_elem is not None else ""

        return {
            "title": title_elem.text.strip(),
            "url": urljoin(self.base_url, href),
            "publish_time": time_elem.text.strip(),
            "abstract": abstract,
            "channel": channel.upper()
        }

    def get_news_detail(self, url):
        """Fetch the detail data of a single news article (not implemented)."""
        # TODO: implement detail-page fetching; the original note calls for
        # dynamic rendering and handling of the site's anti-crawling measures.
        pass
使用示例
if __name__ == '__main__':
    crawler = XJBCrawler()
    # Fetch the first page of the domestic ("china") news channel.
    result = crawler.fetch_news_list(channel='china', page=1)
    # ensure_ascii=False keeps the Chinese text readable in the printed JSON.
    print(json.dumps(result, indent=2, ensure_ascii=False))
    # Crawling etiquette: pause between requests to limit load on the site.
    time.sleep(3)
数据返回示例

```json
{
  "status": 200,
  "data": [
    {
      "title": "京津冀协同发展十年成果报告发布",
      "url": "www.bjnews.com.cn/china/2024/…",
      "publish_time": "2024-03-05 09:30:00",
      "abstract": "北京市发改委公布最新统计数据显示...",
      "channel": "CHINA"
    },
    {
      "title": "北京城市副中心启动智慧交通改造",
      "url": "www.bjnews.com.cn/china/2024/…",
      "publish_time": "2024-03-05 10:15:00",
      "abstract": "副中心将部署2000个智能信号灯...",
      "channel": "CHINA"
    }
  ],
  "page_info": {
    "current_page": 1,
    "next_page": 2
  }
}
```
关键技术说明
反爬策略应对
IP轮换机制、Headers动态生成、请求频率控制(建议<5次/分钟)
数据完整性保障
字段有效性校验、自动重试机制、异常日志记录
法律合规要点
严格遵守网站robots.txt限制;禁止商业性数据复制;数据存储不超过24小时
特别提示:建议优先通过新京报官方合作渠道(open.bjnews.com.cn)获取授权,本方案仅用于教育科研用途。