Python 爬取微博、知乎热榜

956 阅读 · 1 分钟

写在前面：依赖 Python 3、BeautifulSoup4、requests

知乎热榜

from json import encoder
from bs4 import BeautifulSoup

import requests

import json


# Request headers: a desktop browser User-Agent so Zhihu serves the regular
# HTML page to the script.
# NOTE(review): 'scheme' is not a real request header (it is an HTTP/2
# pseudo-header); kept because the original sent it, but the server almost
# certainly ignores it.
headers = {'scheme': 'https',
           'accept': 'text/html, application/xhtml+xml, application/xml',
           'accept-language': 'zh-CN, zh',
           'user-agent': 'Mozilla/5.0 (Windows NT 10.0 Win64 x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36'
           }

# GET the hot-list page; fail fast on an HTTP error status instead of
# silently trying to parse an error page.
r = requests.get(
    'https://www.zhihu.com/billboard', headers=headers)
r.raise_for_status()

# Parse the HTML and pull the initial-state JSON blob that Zhihu embeds
# inside a <script id="js-initialData"> tag.
soup = BeautifulSoup(r.text, 'html.parser')

# Guard against the tag being absent (page layout change) — the original
# would raise an opaque AttributeError here.
init_script = soup.find('script', {'id': 'js-initialData'})
if init_script is None:
    raise RuntimeError('js-initialData <script> tag not found; Zhihu page layout may have changed')
data = json.loads(init_script.get_text())

# The hot list itself is a plain list of entries under this nested key.
# (Dead code removed: the original re-serialized this list into an unused
# string and printed its type for debugging.)
jsonStr = data['initialState']['topstory']['hotList']

# Format each hot-list entry as one numbered line, echo it to stdout,
# and collect every line for the output file.
lines = []
for rank, entry in enumerate(jsonStr, start=1):
    target = entry['target']
    title = target['titleArea']['text']
    excerpt = target['excerptArea']['text']
    image_url = target['imageArea']['url']
    metrics = target['metricsArea']['text']
    link_url = target['link']['url']
    line = '{}:{} {} {} {} {}'.format(
        rank, title, metrics, link_url, image_url, excerpt)
    print(line)
    lines.append(line)

# One trailing newline per line, matching the printed output.
textStr = ''.join(line + '\n' for line in lines)

# Save the formatted list to disk.
with open('zhihu.txt', 'w', encoding="utf-8") as f:
    f.write(textStr)

微博热榜

from types import NoneType
from bs4 import BeautifulSoup
import requests
import json


# Request headers: a desktop browser User-Agent so Weibo serves the AJAX
# endpoint to the script.
# NOTE(review): 'scheme' is not a real request header (HTTP/2 pseudo-header);
# kept because the original sent it, but it is almost certainly ignored.
headers = {'scheme': 'https',
           'accept': 'text/html, application/xhtml+xml, application/xml',
           'accept-language': 'zh-CN, zh',
           'user-agent': 'Mozilla/5.0 (Windows NT 10.0 Win64 x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36'
           }

# GET the hot-band JSON endpoint; fail fast on an HTTP error status
# instead of trying to decode an error page as JSON.
r = requests.get(
    'https://weibo.com/ajax/statuses/hot_band', headers=headers)
r.raise_for_status()

# Decode the JSON response body. r.json() is equivalent to
# json.loads(r.text) but handles encoding detection for us.
# (Debug print of the list type removed.)
data = r.json()

# Collected output lines, one per hot-band entry.
lines = []

# Walk the hot-band entries, skipping promoted topics ('topic_ad' key).
for index, item in enumerate(data['data']['band_list']):
    if 'topic_ad' in item:
        continue
    # Topic title and raw heat value.
    # (Unused local 'note' from the original removed.)
    word = item['word']
    raw_hot = item['num']
    # Extract the first link from the embedded post HTML, if any.
    # Bug fixes vs original:
    #   - `type(x) != NoneType` replaced with `is not None` (idiomatic, and
    #     drops the Python 3.10+ `types.NoneType` requirement);
    #   - guard against an <a> tag without an href (the original would
    #     raise TypeError on 'https:' + None).
    url = ''
    if 'mblog' in item:
        soup = BeautifulSoup(item['mblog']['text'], 'html.parser')
        anchor = soup.find('a')
        if anchor is not None:
            url = anchor.get('href') or ''
    # Optional label (presumably tags like 热/新 — TODO confirm against the
    # live API); fall back to '无' (none) when absent.
    label_name = item.get('icon_desc', '无')
    # Weibo hrefs are protocol-relative, hence the 'https:' prefix.
    # Bug fix: only prepend it when a link was actually found, so entries
    # without a link no longer end with a dangling 'https:'.
    full_url = 'https:' + url if url else ''
    itemStr = '{}:{} {} {} {}'.format(index + 1, label_name, word, raw_hot, full_url)
    print(itemStr)
    lines.append(itemStr)


# Write one line per entry, mirroring the printed output.
with open('weibo.txt', 'w', encoding="utf-8") as f:
    f.write(''.join(line + '\n' for line in lines))