本文已参与「新人创作礼」活动,一起开启掘金创作之路
利用requests、pyquery、xlwings等库抓取微博个人博客数据。
(1)抓取目标网址
(2)用 Chrome 浏览器或360快速浏览器分析微博网页结构。
(3)按功能不同编写不同方法组织代码。
(4)抓取100条微博数据。
# -*- coding: utf-8 -*-
from urllib.parse import urlencode
import requests
from pyquery import PyQuery as pq
import xlwings as xw
def get_page(page):
    """Fetch one page of the target user's weibo timeline from the mobile API.

    :param page: 1-based page index forwarded as the API ``page`` parameter.
    :return: the decoded JSON dict on HTTP 200, otherwise ``None``.
    """
    global base_url  # assigned in the ``__main__`` block below
    # Mimic an in-browser XHR so m.weibo.cn answers with JSON instead of HTML.
    headers = {
        'Host': 'm.weibo.cn',
        'Referer': 'https://m.weibo.cn/u/2830678474',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Cookie': 'M_WEIBOCN_PARAMS=oid%3D4703960021074409%26luicode%3D10000011%26lfid%3D1076032830678474; expires=Wed, 17-Nov-2021 00:39:42 GMT; Max-Age=600; path=/; domain=.weibo.cn; HttpOnly'
    }
    params = {
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        # timeout so a stalled connection cannot hang the scraper forever
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()
    except requests.RequestException as e:
        # RequestException also covers Timeout and HTTPError,
        # which the original ConnectionError-only clause let propagate.
        print('Error', e.args)
    return None  # non-200 status or failed request
def parse_page(json_):
    """Extract weibo records from one API response into the global ``wblist``.

    Each appended row is ``[id, plain_text, attitudes, comments, reposts]``;
    pyquery strips the HTML markup from the ``text`` field.

    :param json_: decoded API response dict, or ``None`` on a failed fetch.
    """
    global wblist
    if not json_:
        return
    # ``data``/``cards`` may be absent on error responses — avoid AttributeError.
    cards = (json_.get('data') or {}).get('cards') or []
    for card in cards:
        item = card.get('mblog')
        if not item:
            # some cards (ads, profile headers) carry no mblog payload
            continue
        print(item)
        wblist.append([item.get('id'),
                       pq(item.get('text')).text(),
                       item.get('attitudes_count'),
                       item.get('comments_count'),
                       item.get('reposts_count')])
if __name__ == '__main__':
    # NOTE: ``global`` is meaningless at module level and was removed;
    # names bound here are module globals already.
    # Header row first; parse_page() appends one row per weibo after it.
    wblist = [['id', 'text', 'attitudes', 'comments', 'reposts']]
    base_url = 'https://m.weibo.cn/api/container/getIndex?'
    for page in range(1, 20):
        # parse_page mutates the global wblist; it has no return value.
        parse_page(get_page(page))
    # Write the collected rows into an Excel workbook via xlwings.
    wb = xw.Book('./data.xlsx')
    sht = wb.sheets('Sheet4')
    sht.range('a1').value = wblist  # dump the whole table starting at A1
# -*- coding: utf-8 -*-
from urllib.parse import urlencode
import requests
from pyquery import PyQuery as pq
import json
# Base endpoint of the Weibo mobile AJAX API; query parameters are appended per request.
base_url = 'https://m.weibo.cn/api/container/getIndex?'
# Request headers that mimic an in-browser XHR so m.weibo.cn serves JSON.
# NOTE(review): the Cookie is a captured session value with a short Max-Age —
# presumably it must be refreshed for the scraper to keep working; verify.
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2830678474',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    'Cookie': 'M_WEIBOCN_PARAMS=oid%3D4703960021074409%26luicode%3D10000011%26lfid%3D1076032830678474; expires=Wed, 17-Nov-2021 00:39:42 GMT; Max-Age=600; path=/; domain=.weibo.cn; HttpOnly'
}
def get_page(page):
    """Fetch one page of the target user's weibo timeline from the mobile API.

    Uses the module-level ``base_url`` and ``headers``.

    :param page: 1-based page index forwarded as the API ``page`` parameter.
    :return: the decoded JSON dict on HTTP 200, otherwise ``None``.
    """
    params = {
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        # timeout so a stalled connection cannot hang the scraper forever
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()
    except requests.RequestException as e:
        # RequestException also covers Timeout and HTTPError,
        # which the original ConnectionError-only clause let propagate.
        print('Error', e.args)
    return None  # non-200 status or failed request
def parse_page(json_):
    """Yield one flat record dict per weibo post found in an API response.

    pyquery strips the HTML markup from the ``text`` field.

    :param json_: decoded API response dict, or ``None`` on a failed fetch.
    :return: generator of ``{'id', 'text', 'attitudes', 'comments', 'reposts'}``.
    """
    if not json_:
        return
    # ``data``/``cards`` may be absent on error responses — avoid AttributeError.
    cards = (json_.get('data') or {}).get('cards') or []
    for card in cards:
        item = card.get('mblog')
        if not item:
            # some cards (ads, profile headers) carry no mblog payload
            continue
        yield {
            'id': item.get('id'),
            'text': pq(item.get('text')).text(),
            'attitudes': item.get('attitudes_count'),
            'comments': item.get('comments_count'),
            'reposts': item.get('reposts_count'),
        }
def write_to_file(content):
    """Append one record to ``result.txt`` as a JSON-serialized line.

    The dict is serialized with ``json.dumps`` (``ensure_ascii=False`` so
    Chinese text stays readable) and terminated with ``,\\n``.

    :param content: the record dict to persist.
    :return: None
    """
    line = json.dumps(content, ensure_ascii=False) + ',\n'
    with open('result.txt', 'a', encoding='utf-8') as out:
        out.write(line)
if __name__ == '__main__':
    # Walk pages 1..15, printing and persisting every parsed record,
    # then report how many weibos were captured in total.
    total = 0
    for page_no in range(1, 16):
        payload = get_page(page_no)
        for record in parse_page(payload):
            print(record)
            total += 1
            write_to_file(record)
    print("抓取的数量为:", total)
结果