Scraping Weibo Personal Blog Data with Python via Ajax



This post scrapes personal Weibo blog data with the requests, pyquery, and xlwings libraries.

(1) Identify the target URL.

(2) Analyze the structure of the Weibo pages with the developer tools of Chrome or the 360 Speed Browser (a quick probe of the Ajax endpoint is sketched right after this list).

(3) Organize the code into separate functions, one per task.

(4) Scrape 100 Weibo posts.
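Before writing the full scraper it helps to request the getIndex endpoint once and look at the JSON it returns (a minimal sketch; the uid and containerid values are the ones used in the code below, and Weibo may additionally require the headers and Cookie shown there):

# Probe one page of the Ajax API and inspect the response structure
import requests

url = ('https://m.weibo.cn/api/container/getIndex?'
       'type=uid&value=2830678474&containerid=1076032830678474&page=1')
resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
data = resp.json()
print(list(data))                      # top-level keys, typically 'ok' and 'data'
print(list(data['data']['cards'][0]))  # each card wraps one 'mblog' post

With the structure confirmed, the full Excel-exporting scraper is: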

# -*- coding: utf-8 -*-
from urllib.parse import urlencode
import requests
from pyquery import PyQuery as pq
import xlwings as xw

def get_page(page):
    """Fetch one page of the user's timeline from the Ajax API and return the parsed JSON."""
    headers = {
        'Host': 'm.weibo.cn',
        'Referer': 'https://m.weibo.cn/u/2830678474',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Cookie': 'M_WEIBOCN_PARAMS=oid%3D4703960021074409%26luicode%3D10000011%26lfid%3D1076032830678474; expires=Wed, 17-Nov-2021 00:39:42 GMT; Max-Age=600; path=/; domain=.weibo.cn; HttpOnly'  # session-specific and short-lived; replace with a fresh Cookie from your own browser
    }
    params = {
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('Error', e.args)


def parse_page(json_):
    """Pull id, text, and engagement counts from each card and append a row to wblist."""
    if json_:
        items = json_.get('data', {}).get('cards', [])
        for item in items:
            item = item.get('mblog')
            if not item:  # some cards (e.g. ads) carry no mblog entry
                continue
            print(item)  # debug: show the raw mblog dict
            wblist.append([item.get('id'),
                           pq(item.get('text')).text(),  # pyquery strips the HTML tags from the post body
                           item.get('attitudes_count'),
                           item.get('comments_count'),
                           item.get('reposts_count')])



if __name__ == '__main__':
    wblist = [['id', 'text', 'attitudes', 'comments', 'reposts']]
    base_url = 'https://m.weibo.cn/api/container/getIndex?'
    for page in range(1, 20):
        json_ = get_page(page)
        parse_page(json_)
    # Write the collected rows to Excel (data.xlsx with a sheet named Sheet4 must already exist)
    wb = xw.Book('./data.xlsx')
    sht = wb.sheets('Sheet4')
    sht.range('a1').value = wblist  # write the 2-D list starting at cell A1
    wb.save()  # persist the workbook; without this the data stays only in the open Excel instance
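A note on the export step: xlwings drives an installed copy of Excel, and xw.Book('./data.xlsx') expects the workbook, with a sheet named Sheet4, to already exist. If Excel is not available, the xlwings block can be swapped for pandas (a sketch, not part of the original; it assumes openpyxl is installed and writes a fresh file instead of filling an existing sheet):

import pandas as pd

# wblist[0] is the header row built above; the remaining entries are data rows
df = pd.DataFrame(wblist[1:], columns=wblist[0])
df.to_excel('./data.xlsx', sheet_name='Sheet4', index=False)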
An alternative version of the same scraper replaces the Excel export with a text file: parse_page becomes a generator, and each post is serialized to JSON and appended to result.txt.

# -*- coding: utf-8 -*-
from urllib.parse import urlencode
import requests
from pyquery import PyQuery as pq
import json
base_url = 'https://m.weibo.cn/api/container/getIndex?'

headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2830678474',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    'Cookie': 'M_WEIBOCN_PARAMS=oid%3D4703960021074409%26luicode%3D10000011%26lfid%3D1076032830678474; expires=Wed, 17-Nov-2021 00:39:42 GMT; Max-Age=600; path=/; domain=.weibo.cn; HttpOnly'  # session-specific and short-lived; replace with a fresh Cookie from your own browser
}

def get_page(page):
    """Fetch one page of the user's timeline from the Ajax API and return the parsed JSON."""
    params = {
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('Error', e.args)


def parse_page(json_):
    """Yield one dict per post, with the same fields as the Excel version above."""
    if json_:
        items = json_.get('data', {}).get('cards', [])
        for item in items:
            item = item.get('mblog')
            if not item:  # some cards (e.g. ads) carry no mblog entry
                continue
            yield {
                'id': item.get('id'),
                'text': pq(item.get('text')).text(),
                'attitudes': item.get('attitudes_count'),
                'comments': item.get('comments_count'),
                'reposts': item.get('reposts_count'),
            }


def write_to_file(content):
    """
    Serialize one post dict with json.dumps() and append it to a text file.
    :param content: dict yielded by parse_page
    :return: None
    """
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + ',\n')

if __name__ == '__main__':
    count = 0
    for page in range(1, 16):
        json_ = get_page(page)
        results = parse_page(json_)
        for result in results:
            print(result)
            count += 1
            write_to_file(result)
    print('Number of posts scraped:', count)

Results (screenshots of the scraped output omitted).
