Weibo Crawler

Crawling the hot search list

The API is weibo.com/ajax/side/hotSearch (the full request appears in the code below).
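Before writing the full crawler it helps to peek at what this endpoint returns. A minimal sketch (the field names note, word and num are the ones used by the script below; the exact response layout is Weibo's and may change):

import requests

headers = {'User-Agent': 'Mozilla/5.0'}
resp = requests.get('https://weibo.com/ajax/side/hotSearch', headers=headers)
# The hot search entries live under data.realtime
items = resp.json()['data']['realtime']
print(items[0]['note'], items[0]['num'])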

import requests
import sqlalchemy
import pandas as pd


path = 'weibo.sqlite3'
engine = sqlalchemy.create_engine(f'sqlite:///{path}')
conn = engine.connect()

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36'
}

def get_data():
    """Fetch the current Weibo hot search list.

    Returns:
        list[dict]: {title: topic title, url: search URL, num: heat value, hot: hot-search badge}
    """

    data = []
    response = requests.get("https://weibo.com/ajax/side/hotSearch", headers=headers)
    data_json = response.json()['data']['realtime']
    # Map the API's category labels (movie, drama series, variety show, music) to one-character badges
    jyzy = {
        '电影': '影',
        '剧集': '剧',
        '综艺': '综',
        '音乐': '音'
    }

    for data_item in data_json:
        hot = ''
        # Skip promoted (ad) entries
        if 'is_ad' in data_item:
            continue
        if 'flag_desc' in data_item:
            hot = jyzy.get(data_item['flag_desc'])
        if 'is_boom' in data_item:
            hot = '爆'
        if 'is_hot' in data_item:
            hot = '热'
        if 'is_fei' in data_item:
            hot = '沸'
        if 'is_new' in data_item:
            hot = '新'

        dic = {
            'title': data_item['note'],
            'url': 'https://s.weibo.com/weibo?q=%23' + data_item['word'] + '%23',
            'num': data_item['num'],
            'hot': hot
        }
        data.append(dic)

    return data


if __name__ == '__main__':
    data = get_data()
    data = pd.DataFrame(data=data)
    # Titles that have already been stored, used to drop duplicates
    sql = 'select distinct title from weibo_resou'
    crawled_resou = pd.read_sql(sql, conn)['title'].values.tolist()
    data = data[~data['title'].isin(crawled_resou)]
    print(data)
    data.to_sql('weibo_resou', conn, if_exists='append', index=False, chunksize=100)
    conn.close()
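On the very first run the weibo_resou table does not exist yet, so the select above fails. A small guard that would replace the read step (a sketch assuming SQLAlchemy 1.4+, where the inspector exposes has_table):

from sqlalchemy import inspect

if inspect(engine).has_table('weibo_resou'):
    crawled_resou = pd.read_sql('select distinct title from weibo_resou', conn)['title'].values.tolist()
else:
    crawled_resou = []  # first run: nothing stored yet
data = data[~data['title'].isin(crawled_resou)]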

Crawling Weibo posts

You need to be logged in to Weibo and use your own cookie.
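If you would rather not hard-code the cookie in the script below, it could be pulled from an environment variable instead (WEIBO_COOKIE is purely an illustrative name):

import os

# Assumption: WEIBO_COOKIE holds the raw Cookie header copied from the browser's developer tools
cookie = os.environ.get('WEIBO_COOKIE', '')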

import time
import requests
import sqlalchemy
from bs4 import BeautifulSoup
import pandas as pd
import pickle as pkl
import torch

BASE_DIR = 'mysite'

path = 'weibo.sqlite3'
engine = sqlalchemy.create_engine(f'sqlite:///{path}')
conn = engine.connect()

# TODO: paste your own cookie here
cookie = '''
'''

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
    'Cookie': cookie.strip(),
    'Referer': 'https://s.weibo.com/weibo?q=%23%E5%B0%8F%E4%BC%99%E6%B5%8F%E8%A7%88%E9%9D%9E%E6%B3%95%E6%B6%89%E9%BB%84APP%E4%BA%94%E5%A4%A9%E8%A2%AB%E9%AA%97142%E4%B8%87%23&Refer=top&page=4',
}

# Load the sentiment model
with open(BASE_DIR + '/notebooks/save_models/data_bundle.pkl', 'rb') as fp:
    data_bundle = pkl.load(fp)

from fastNLP.io.model_io import ModelLoader

model = ModelLoader.load_pytorch_model(BASE_DIR + '/notebooks/save_models/cnn.senti.pkl')
char_vocab = data_bundle.get_vocab('chars')

def predict(sentence):
    """
    Predict the sentiment label of a sentence
    """
    idx = [char_vocab.to_index(s) for s in sentence]
    idx = torch.tensor([idx])
    return model.predict(idx)['pred'][0].item()
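
# Usage sketch: predict('这条微博真不错') returns the predicted class index;
# which index means positive vs. negative depends on how the model behind
# cnn.senti.pkl was trained, so check it against a few labelled examples first.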

def crawl_content(url, resou_id):
    """
    Crawl one search result page for a hot search and store the posts
    """
    try:
        response = requests.get(url, headers=headers)
    except requests.RequestException:
        print(url, 'request failed')
        return False

    if response.status_code != 200:
        print('status code: ', response.status_code)
        return False
    soup = BeautifulSoup(response.text, 'html5lib')
    data = []
    for div in soup.select('div[action-type="feed_list_item"] div.card'):
        try:
            content = div.select_one('div.content p.txt').get_text().strip().replace(' ', '').replace('\n', '')
            print(content)
            label = predict(content)
            data.append([content, label])
        except Exception:
            continue
    data = pd.DataFrame(data=data)
    if data.shape[0] > 0:
        data.columns = ['content', 'label']
        data['resou_id'] = resou_id
        data.to_sql('weibo_weibo', conn, if_exists='append', chunksize=100, index=False)
        print('inserted successfully')
    else:
        print('no data')
    print(url, 'end')
    # Check whether there is a next page
    if soup.select_one('div.m-page a.next') is None:
        return False
    return True

# Start crawling
if __name__ == '__main__':
    # Note: the sub-select assumes the weibo_weibo table already exists; on a first run
    # it will fail until some posts have been stored
    sql = '''
        select
            id,
            title,
            url
        from
            weibo_resou
        where
            id not in (
                select distinct resou_id from weibo_weibo
            )
        order by
            id desc
    '''
    df = pd.read_sql(sql, conn)
    for _, row in df.iterrows():
        _id = row['id']
        url = row['url']
        # Crawl up to 20 pages
        for page_index in range(1, 21):
            _url = url + f'&page={page_index}'
            print(_url, 'start')
            has_next_page = crawl_content(_url, _id)
            time.sleep(2)
            # No more pages
            if has_next_page is False:
                break
        time.sleep(3)
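
After a run, the stored posts can be checked straight from the same SQLite file (a small verification sketch reusing the table and columns created above):

import sqlalchemy
import pandas as pd

engine = sqlalchemy.create_engine('sqlite:///weibo.sqlite3')
with engine.connect() as conn:
    print(pd.read_sql('select resou_id, label, content from weibo_weibo limit 5', conn))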
        

Crawling Weibo user info

You need to be logged in to Weibo and use your own cookie.

    def parse_user(self, response):
        html = response.text
        # ID of the user who published the post
        user_id = response.meta['user_id']
        # The user dict passed along through meta
        user = response.meta['user']
        # Extract the IP location (province) with a regex
        pattern = r'.*IP属地:(.*?)<\\/span>'
        try:
            province = re.search(pattern, html).group(1).strip().replace('\\t', '').strip()
        except AttributeError:
            province = None
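
The method above is a single callback from a Scrapy spider. A minimal skeleton showing where it could sit (the spider name, profile URL pattern and cookie handling are assumptions for illustration, not part of the original code; note that parse_user also needs import re):

import re
import scrapy

# TODO: paste your own cookie here, same as in the scripts above
COOKIE = ''

class WeiboUserSpider(scrapy.Spider):
    name = 'weibo_user'

    def start_requests(self):
        # Assumption: the user ids to visit come from an earlier crawling step
        for user_id in ['1234567890']:
            yield scrapy.Request(
                f'https://weibo.com/u/{user_id}',
                callback=self.parse_user,
                # Scrapy expects cookies as a dict, so split the raw header string
                cookies=dict(c.strip().split('=', 1) for c in COOKIE.split(';') if '=' in c),
                meta={'user_id': user_id, 'user': {}},
            )

    # parse_user(self, response) as shown above goes here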

References

  1. Using cookies in Scrapy. zhuanlan.zhihu.com/p/337212121