环境准备
- 技术:python
- 第三方库:requests、pandas
- 获取的数据:36kr_newsflash.csv
接口分析Url:
https://gateway.36kr.com/api/mis/nav/newsflash/list
使用浏览器 F12 开发者工具抓包,得到的请求参数:
# JSON body POSTed to the newsflash list endpoint, as captured from the browser.
# `timestamp` is a variable, not a literal; the full script computes it as
# int(time.time() * 1000).
# `pageCallback` is the paging cursor returned by the previous response and sent
# back unchanged on the next request. NOTE(review): the sample value looks like
# unpadded Base64 of a JSON object (firstId / lastId / firstCreateTime /
# lastCreateTime / lastParam) — confirm before relying on its structure.
# The remaining fields are reproduced as captured; their meaning is not
# documented here.
{
'partner_id': 'web',
'timestamp': timestamp,
'param': {
'pageSize': 20,
'pageEvent': 1,
'pageCallback': 'eyJmaXJzdElkIjozNjY4NDUwMDA5MDMxNTU3LCJsYXN0SWQiOjM2NjgzNTU4ODU3ODU3MzQsImZpcnN0Q3JlYXRlVGltZSI6MTc3MDE3NjQxOTQ5NiwibGFzdENyZWF0ZVRpbWUiOjE3NzAxNzA2NzQ2NjksImxhc3RQYXJhbSI6IjEifQ',
'siteId': 1,
'type': 0,
'platformId': 2,
},
}
分析加密字段 pageCallback:每次请求的响应中都会返回一个对应的 pageCallback,该字段由后端生成,用于下一次的接力(翻页)请求;timestamp 为毫秒级时间戳。
对爬虫进行伪装:
# Request headers copied from a desktop Chrome session so that the crawler's
# request resembles one issued by the 36kr web page itself (Origin/Referer point
# at www.36kr.com, the body is declared as JSON).
headers = {
'Accept': '*/*',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Content-Type': 'application/json',
'Origin': 'https://www.36kr.com',
'Pragma': 'no-cache',
'Referer': 'https://www.36kr.com/',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36',
'sec-ch-ua': '"Not(A:Brand";v="8", "Chromium";v="144", "Google Chrome";v="144"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
# Cookie captured from the browser session and sent along with the request.
# NOTE(review): the value is a URL-encoded blob that appears truncated/garbled in
# places; presumably an analytics cookie that the API does not validate —
# confirm whether it is needed at all.
cookies = {
'sensorsdata2015jssdkcross': '%7B%22distinct_id%22%3A%2219c0885b730376-0a8736ac2602898-26061d51-2073600-19c0885b7311414%22%2C%22%24device_id%22%3A%2219c0885b730376-0a8736ac2602898-26061d51-2073600-19c0885b7311414%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24errer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24lates2%3AAA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D',
}
使用pandas 进行数据存储:
import os

# One CSV row per news-flash item; the keys become the CSV column names.
pdf_data = {
    'itemId': itemId,
    'itemType': itemType,
    'templateType': templateType,
    'widgetImage': widgetImage,
    'publishTime': publishTime,
    'widgetTitle': widgetTitle,
    'widgetContent': widgetContent,
    'route': route,
    'siteId': siteId,
}

# Use a single variable for the path so the existence check and the write can
# never refer to different files (the header row must be written only once,
# when the file is first created).
csv_path = '36kr_newsflash.csv'
df = pd.DataFrame([pdf_data])
df.to_csv(
    csv_path,
    encoding='utf-8',
    mode='a',
    index=False,
    header=not os.path.exists(csv_path),
)
完整代码:
import time
from pathlib import Path

import pandas as pd
import requests
# Endpoint that returns the 36kr news-flash list.
API_URL = 'https://gateway.36kr.com/api/mis/nav/newsflash/list'
# Output file; rows are appended across runs.
CSV_PATH = '36kr_newsflash.csv'
# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10
# Paging cursor captured from a browser session; used for the first request.
# Every response returns a fresh cursor for the following page.
INITIAL_PAGE_CALLBACK = 'eyJmaXJzdElkIjozNjY4NDUwMDA5MDMxNTU3LCJsYXN0SWQiOjM2NjgzNTU4ODU3ODU3MzQsImZpcnN0Q3JlYXRlVGltZSI6MTc3MDE3NjQxOTQ5NiwibGFzdENyZWF0ZVRpbWUiOjE3NzAxNzA2NzQ2NjksImxhc3RQYXJhbSI6IjEifQ'
# CSV column order (same order as the original per-row dict).
CSV_COLUMNS = [
    'itemId',
    'itemType',
    'templateType',
    'widgetImage',
    'publishTime',
    'widgetTitle',
    'widgetContent',
    'route',
    'siteId',
]

# Cookie captured from the browser session and sent with every request.
cookies = {
    'sensorsdata2015jssdkcross': '%7B%22distinct_id%22%3A%2219c0885b730376-0a8736ac2602898-26061d51-2073600-19c0885b73%22%3A%2219c0885b730376-0a8736ac2602898-26061d51-2073600-19c0885b7311414%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%er%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D',
}

# Browser-like headers so the request resembles one made by the 36kr web page.
headers = {
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Content-Type': 'application/json',
    'Origin': 'https://www.36kr.com',
    'Pragma': 'no-cache',
    'Referer': 'https://www.36kr.com/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not(A:Brand";v="8", "Chromium";v="144", "Google Chrome";v="144"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}


def build_payload(page_callback):
    """Return the JSON request body for the page identified by *page_callback*."""
    return {
        'partner_id': 'web',
        'timestamp': int(time.time() * 1000),  # milliseconds since the epoch
        'param': {
            'pageSize': 20,
            'pageEvent': 1,
            'pageCallback': page_callback,
            'siteId': 1,
            'type': 0,
            'platformId': 2,
        },
    }


def fetch_page(page_callback):
    """POST one list request and return the response's ``data`` object.

    Returns an empty dict when the response carries no ``data`` so callers can
    keep using ``.get`` safely.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or timeout.
    """
    response = requests.post(
        API_URL,
        cookies=cookies,
        headers=headers,
        json=build_payload(page_callback),
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    # Parse the body once instead of re-decoding it for every field.
    return response.json().get('data') or {}


def extract_row(item):
    """Flatten one ``itemList`` entry into a dict keyed by the CSV columns.

    Fields that are absent (including a missing ``templateMaterial``) come
    back as ``None`` rather than raising.
    """
    material = item.get('templateMaterial') or {}
    return {
        'itemId': item.get('itemId'),
        'itemType': item.get('itemType'),
        'templateType': material.get('templateType'),
        'widgetImage': material.get('widgetImage'),
        'publishTime': material.get('publishTime'),
        'widgetTitle': material.get('widgetTitle'),
        'widgetContent': material.get('widgetContent'),
        'route': item.get('route'),
        'siteId': item.get('siteId'),
    }


def save_rows(rows, csv_path=CSV_PATH):
    """Append *rows* to *csv_path*, writing the header only for a new file.

    Does nothing when *rows* is empty.
    """
    if not rows:
        return
    # dtype=object keeps integers intact when a column also contains None;
    # otherwise pandas would upcast the whole column to float ("123.0").
    frame = pd.DataFrame(rows, columns=CSV_COLUMNS, dtype=object)
    frame.to_csv(
        csv_path,
        encoding='utf-8',
        mode='a',
        index=False,
        header=not Path(csv_path).exists(),
    )


def main(max_pages=1):
    """Fetch up to *max_pages* pages of news flashes and append them to the CSV.

    The default of one page matches the original single-request behaviour.
    Each response's ``pageCallback`` is fed into the next request.
    """
    page_callback = INITIAL_PAGE_CALLBACK
    for _ in range(max_pages):
        data = fetch_page(page_callback)
        page_callback = data.get('pageCallback')
        has_next_page = data.get('hasNextPage')
        print(f"pageCallback: {page_callback}, hasNextPage: {has_next_page}")
        save_rows([extract_row(item) for item in data.get('itemList') or []])
        if not has_next_page or not page_callback:
            break


if __name__ == '__main__':
    main()