数据爬虫:获取某站点新闻数据(36氪)

65 阅读4分钟

环境准备

  1. 技术:python
  2. 第三方库:requests、pandas
  3. 获取的数据:36kr_newsflash.csv

接口分析,URL:

  https://gateway.36kr.com/api/mis/nav/newsflash/list

在浏览器 F12 开发者工具中查看到的请求参数:

# JSON body of the POST request, as captured from the browser.
{
    'partner_id': 'web',
    # Request timestamp; `timestamp` must be defined before this literal is used.
    'timestamp': timestamp,
    'param': {
        'pageSize': 20,
        'pageEvent': 1,
        # Paging token: the server returns a new pageCallback with every response,
        # and that value is sent back here to request the following page.
        # NOTE(review): looks like base64-encoded JSON (starts with 'eyJ') — confirm.
        'pageCallback': 'eyJmaXJzdElkIjozNjY4NDUwMDA5MDMxNTU3LCJsYXN0SWQiOjM2NjgzNTU4ODU3ODU3MzQsImZpcnN0Q3JlYXRlVGltZSI6MTc3MDE3NjQxOTQ5NiwibGFzdENyZWF0ZVRpbWUiOjE3NzAxNzA2NzQ2NjksImxhc3RQYXJhbSI6IjEifQ',
        'siteId': 1,
        'type': 0,
        'platformId': 2,
    },
}

分析加密字段 pageCallback:每次请求的响应中都会返回一个对应的 pageCallback,该字段由后端生成,用于下一次的接力(翻页)请求;timestamp 为毫秒级时间戳。

对爬虫进行伪装:

# HTTP request headers copied from a desktop Chrome session so that the
# scraper's request resembles one issued by the 36kr web page itself.
headers = {
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    # The request body is sent as JSON.
    'Content-Type': 'application/json',
    'Origin': 'https://www.36kr.com',
    'Pragma': 'no-cache',
    'Referer': 'https://www.36kr.com/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not(A:Brand";v="8", "Chromium";v="144", "Google Chrome";v="144"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"'
}

# Cookie sent with the request; the value is a URL-encoded (percent-escaped) string.
# NOTE(review): the value looks garbled in places (e.g. '%24errer', '%24lates2%3AAA');
# re-capture it from the browser if the cookie is actually required — confirm.
cookies = {
    'sensorsdata2015jssdkcross': '%7B%22distinct_id%22%3A%2219c0885b730376-0a8736ac2602898-26061d51-2073600-19c0885b7311414%22%2C%22%24device_id%22%3A%2219c0885b730376-0a8736ac2602898-26061d51-2073600-19c0885b7311414%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24errer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24lates2%3AAA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D',
}

使用pandas 进行数据存储:

   # One CSV row: the fields extracted from a single newsflash item.
   pdf_data = {
       'itemId': itemId,
       'itemType': itemType,
       'templateType': templateType,
       'widgetImage': widgetImage,
       'publishTime': publishTime,
       'widgetTitle': widgetTitle,
       'widgetContent': widgetContent,
       'route': route,
       'siteId': siteId,
   }
   df = pd.DataFrame([pdf_data])
   # Append to the CSV; emit the header row only when the file does not exist yet.
   # The existence check must target the same file that is being written,
   # otherwise the header is repeated on every append.
   df.to_csv('36kr_newsflash.csv', encoding='utf-8', mode='a', index=False, header=not pd.io.common.file_exists('36kr_newsflash.csv')) # pyright: ignore[reportAttributeAccessIssue]

image.png

完整代码:

import os
import time

import pandas as pd
import requests

# Cookie sent with the request; the value is a URL-encoded (percent-escaped) string.
# NOTE(review): the value contains a malformed percent-escape ('%E6%B5%er'), so it
# was probably damaged when pasted; re-capture it from the browser if the cookie
# is actually required — confirm.
cookies = {
    'sensorsdata2015jssdkcross': '%7B%22distinct_id%22%3A%2219c0885b730376-0a8736ac2602898-26061d51-2073600-19c0885b73%22%3A%2219c0885b730376-0a8736ac2602898-26061d51-2073600-19c0885b7311414%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%er%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D',
}

# HTTP request headers copied from a desktop Chrome session so that the
# scraper's request resembles one issued by the 36kr web page itself.
headers = {
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    # The request body is sent as JSON (see the `json=` argument of the POST).
    'Content-Type': 'application/json',
    'Origin': 'https://www.36kr.com',
    'Pragma': 'no-cache',
    'Referer': 'https://www.36kr.com/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not(A:Brand";v="8", "Chromium";v="144", "Google Chrome";v="144"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"'
}

# Millisecond Unix timestamp sent in the request body.
timestamp = int(time.time() * 1000)

# JSON body of the POST request.
json_data = {
    'partner_id': 'web',
    'timestamp': timestamp,
    'param': {
        'pageSize': 20,
        'pageEvent': 1,
        # Paging token captured from an earlier response; the server returns a
        # fresh one with every reply (printed below).
        'pageCallback': 'eyJmaXJzdElkIjozNjY4NDUwMDA5MDMxNTU3LCJsYXN0SWQiOjM2NjgzNTU4ODU3ODU3MzQsImZpcnN0Q3JlYXRlVGltZSI6MTc3MDE3NjQxOTQ5NiwibGFzdENyZWF0ZVRpbWUiOjE3NzAxNzA2NzQ2NjksImxhc3RQYXJhbSI6IjEifQ',
        'siteId': 1,
        'type': 0,
        'platformId': 2,
    },
}

# Output file and its column order (one column per extracted field).
CSV_PATH = '36kr_newsflash.csv'
CSV_COLUMNS = [
    'itemId',
    'itemType',
    'templateType',
    'widgetImage',
    'publishTime',
    'widgetTitle',
    'widgetContent',
    'route',
    'siteId',
]

response = requests.post(
    'https://gateway.36kr.com/api/mis/nav/newsflash/list',
    cookies=cookies,
    headers=headers,
    json=json_data,
    timeout=10,  # never hang forever on a stalled connection
)
# Fail loudly on HTTP errors instead of trying to parse an error page.
response.raise_for_status()

# Parse the body once; tolerate a missing or null "data" / "itemList".
payload = response.json().get('data') or {}
data: list = payload.get('itemList') or []
pageCallback = payload.get('pageCallback')
hasNextPage = payload.get('hasNextPage')
print(f"pageCallback: {pageCallback}, hasNextPage: {hasNextPage}")

# Collect every item first so the CSV is opened and written only once.
rows = []
for item in data:
    # "templateMaterial" holds the display fields; it may be absent or null.
    material = item.get('templateMaterial') or {}
    rows.append({
        'itemId': item.get('itemId'),
        'itemType': item.get('itemType'),
        'templateType': material.get('templateType'),
        'widgetImage': material.get('widgetImage'),
        'publishTime': material.get('publishTime'),
        'widgetTitle': material.get('widgetTitle'),
        'widgetContent': material.get('widgetContent'),
        'route': item.get('route'),
        'siteId': item.get('siteId'),
    })

if rows:
    df = pd.DataFrame(rows, columns=CSV_COLUMNS)
    # Append mode; write the header row only when the file is being created.
    df.to_csv(
        CSV_PATH,
        encoding='utf-8',
        mode='a',
        index=False,
        header=not os.path.exists(CSV_PATH),
    )