# 写到前面: 运行环境依赖 python3、BeautifulSoup4、requests
# 第一部分: 知乎热榜
from json import encoder  # NOTE(review): unused in this script; kept in case other code relies on it
from bs4 import BeautifulSoup
import requests
import json

# Request headers: a desktop Chrome UA so Zhihu serves the normal HTML page.
# NOTE(review): 'scheme' is not a real HTTP header (it is an HTTP/2 pseudo-header
# written as ':scheme'); requests sends it verbatim and servers ignore it.
headers = {
    'scheme': 'https',
    'accept': 'text/html, application/xhtml+xml, application/xml',
    'accept-language': 'zh-CN, zh',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0 Win64 x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36'
}

# GET the hot-list page; a timeout keeps the script from hanging forever,
# and raise_for_status surfaces HTTP errors instead of parsing an error page.
r = requests.get(
    'https://www.zhihu.com/billboard', headers=headers, timeout=10)
r.raise_for_status()

# The page embeds its initial state as JSON inside <script id="js-initialData">.
soup = BeautifulSoup(r.text, 'html.parser')
init_script = soup.find('script', {'id': 'js-initialData'})
if init_script is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError('js-initialData <script> tag not found; page layout may have changed')
data = json.loads(init_script.get_text())

# hotList is the list of hot-board entries.
hot_list = data['initialState']['topstory']['hotList']
# Print the container type (list), as the original script did.
print(type(hot_list))

# Collect one formatted line per entry and join once at the end
# (avoids quadratic string concatenation in the loop).
lines = []
for index, item in enumerate(hot_list, start=1):
    target = item['target']
    title = target['titleArea']['text']       # question title
    excerpt = target['excerptArea']['text']   # summary text
    image = target['imageArea']['url']        # cover image URL
    metrics = target['metricsArea']['text']   # heat value text
    link = target['link']['url']              # question URL
    item_str = '{}:{} {} {} {} {}'.format(
        index, title, metrics, link, image, excerpt)
    print(item_str)
    lines.append(item_str)

# Save the formatted hot list (one entry per line) to a text file.
with open('zhihu.txt', 'w', encoding="utf-8") as f:
    f.write(''.join(s + '\n' for s in lines))
# 第二部分: 微博热榜
from types import NoneType  # NOTE(review): requires Python 3.10+; no longer needed after the `is None` fix below
from bs4 import BeautifulSoup
import requests
import json

# Request headers: a desktop Chrome UA for the Weibo AJAX endpoint.
# NOTE(review): 'scheme' is not a real HTTP header; servers ignore it.
headers = {
    'scheme': 'https',
    'accept': 'text/html, application/xhtml+xml, application/xml',
    'accept-language': 'zh-CN, zh',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0 Win64 x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36'
}

# GET the hot-band JSON endpoint; timeout prevents an indefinite hang,
# raise_for_status surfaces HTTP errors early.
r = requests.get(
    'https://weibo.com/ajax/statuses/hot_band', headers=headers, timeout=10)
r.raise_for_status()

# Parse the JSON response body into a dict.
data = json.loads(r.text)
# Print the container type (list), as the original script did.
print(type(data['data']['band_list']))

# Collect one formatted line per entry; join once at the end
# (avoids quadratic string concatenation in the loop).
lines = []
for index, item in enumerate(data['data']['band_list'], start=1):
    if 'topic_ad' in item:
        # Skip promoted entries; note they still consume an index number,
        # matching the original numbering behavior.
        continue
    word = item['word']      # topic title
    raw_hot = item['num']    # heat value
    # The 'mblog' field carries an HTML snippet; extract the first <a> href
    # as the entry's link, if one exists.
    url = ''
    if 'mblog' in item:
        snippet = BeautifulSoup(item['mblog']['text'], 'html.parser')
        anchor = snippet.find('a')
        if anchor is not None:  # idiomatic None check (was: type(...) != NoneType)
            url = anchor.get('href')
    # Label text; '无' ("none") when the entry carries no icon_desc.
    label_name = item.get('icon_desc', '无')
    item_str = '{}:{} {} {} {}'.format(
        index, label_name, word, raw_hot, 'https:' + url)
    print(item_str)
    lines.append(item_str)

# Write the formatted hot list (one entry per line) to a text file.
with open('weibo.txt', 'w', encoding="utf-8") as f:
    f.write(''.join(s + '\n' for s in lines))