B站热门视频数据爬取并可视化

475 阅读4分钟

B站热门接口 url: https://api.bilibili.com/x/web-interface/popular?ps=20&pn=1

ps:每页查询的记录数量

pn:当前页号

有数据时的响应,为了方便查看,这里使用的参数: ps=20&pn=1

{
  "code": 0,
  "message": "0",
  "ttl": 1,
  "data": {
    "list": [
      {
        "aid": 409920363,
        "videos": 1,
        "tid": 138,
        "tname": "搞笑",
        "copyright": 1,
        "pic": "http://i2.hdslb.com/bfs/archive/585d1431a2760eb3a93d9aa6f95f1d2e7ac733e9.jpg",
        "title": "好好好,这样玩是吧!",
        "pubdate": 1702867902,
        "ctime": 1702867902,
        "desc": "",
        "state": 0,
        "duration": 184,
        "mission_id": 4008759,
        "rights": {
          "bp": 0,
          "elec": 0,
          "download": 0,
          "movie": 0,
          "pay": 0,
          "hd5": 1,
          "no_reprint": 1,
          "autoplay": 1,
          "ugc_pay": 0,
          "is_cooperation": 0,
          "ugc_pay_preview": 0,
          "no_background": 0,
          "arc_pay": 0,
          "pay_free_watch": 0
        },
        "owner": {
          "mid": 1155574439,
          "name": "真子日记",
          "face": "https://i2.hdslb.com/bfs/face/025e62927222dc7979b00eca7c19c55f80587c08.jpg"
        },
        "stat": {
          "aid": 409920363,
          "view": 616219,
          "danmaku": 2015,
          "reply": 1748,
          "favorite": 4246,
          "coin": 9287,
          "share": 2930,
          "now_rank": 0,
          "his_rank": 0,
          "like": 70150,
          "dislike": 0,
          "vt": 0,
          "vv": 616219
        },
        "dynamic": "",
        "cid": 1371388574,
        "dimension": {
          "width": 1080,
          "height": 1920,
          "rotate": 0
        },
        "short_link_v2": "https://b23.tv/BV1PG411Y7cz",
        "first_frame": "http://i0.hdslb.com/bfs/storyff/n231218sa1n01dms3x6j5s2kqx5b5m6d_firsti.jpg",
        "pub_location": "广东",
        "bvid": "BV1PG411Y7cz",
        "season_type": 0,
        "is_ogv": false,
        "ogv_info": null,
        "enable_vt": 0,
        "ai_rcmd": null,
        "rcmd_reason": {
          "content": "6万点赞",
          "corner_mark": 0
        }
      }
    ],
    "no_more": false
  }
}

无数据时的响应

{
    "code": 0,
    "message": "0",
    "ttl": 1,
    "data": {"list":[],"no_more":true}
}

no_more=true:表示没有更多数据,所以就按这个结果来判断是否继续请求数据

这个 git 仓库有B站各个接口的分析: github.com/SocialSiste…

配置文件

{
  "url": "https://api.bilibili.com/x/web-interface/popular",
  "headers": {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
  }
}

脚本源码

from datetime import datetime
import numpy as np
from numpy import nan as NA
import json
import requests
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba

def loadConfig():
    """Read the crawler configuration (API url and request headers) from config.json.

    Returns
    -------
    dict
        The parsed configuration object.
    """
    # Force UTF-8: JSON is UTF-8 by spec and the file may contain non-ASCII
    # values; relying on the platform default encoding (e.g. GBK on Windows)
    # would make json.load fail on such content.
    with open('config.json', encoding='utf-8') as config_file:
        config = json.load(config_file)
    return config


# Load the global configuration once at import time (reads config.json from
# the working directory; NOTE: this is a module-level side effect).
config = loadConfig()


def requesData(pageTotal, pageNum):
    """Request one page of B站's popular-video list.

    Parameters
    ----------
    pageTotal : int
        Records per page (the ``ps`` query parameter).
    pageNum : int
        1-based page number (the ``pn`` query parameter).

    Returns
    -------
    dict
        The decoded JSON response body.
    """
    params = {
        'ps': pageTotal,
        'pn': pageNum
    }
    # A timeout keeps the crawl loop from hanging forever on a stalled
    # connection (requests has no default timeout).
    response = requests.get(config['url'], headers=config['headers'],
                            params=params, timeout=30)
    response.encoding = 'utf-8'
    return json.loads(response.text)


def collation(df):
    """Clean one page of raw popular-video records.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame built from the ``data.list`` array of the API response,
        one row per video.

    Returns
    -------
    pandas.DataFrame
        Frame with unused columns dropped, timestamps parsed, duration
        formatted, and the nested ``stat`` dict flattened into columns.
    """
    # Drop columns that the later analysis/plots never use.
    drop_columns = ['bvid', 'videos', 'tid', 'pic', 'desc', 'state', 'rights',
                    'owner', 'dynamic', 'cid', 'dimension', 'short_link_v2',
                    'first_frame', 'season_type', 'is_ogv', 'ogv_info', 'enable_vt',
                    'ai_rcmd', ]
    df.drop(columns=drop_columns, inplace=True)
    # copyright: 1 = original upload, 2 = repost.  The API returns an int,
    # so compare against 1 — the old comparison against the string '1' was
    # always False and labeled every video as a repost.
    df['copyright'] = df['copyright'].map(lambda x: '原创' if x == 1 else '转载')
    # pubdate: publish time, Unix timestamp in seconds -> datetime.
    df['pubdate'] = pd.to_datetime(df['pubdate'], unit='s')
    # ctime: submission time, Unix timestamp in seconds -> datetime
    # (the old comment mislabeled this as the total duration).
    df['ctime'] = pd.to_datetime(df['ctime'], unit='s')
    # duration: total length in seconds -> 'MM:SS' (or 'HH:MM:SS' if >= 1h).
    df['duration'] = df['duration'].map(_duration_str)
    # rcmd_reason: keep only the human-readable 'content'.  Guard against
    # missing values (NaN/None) — the old lambda crashed on them — and map
    # empty strings to NA as before.
    df['rcmd_reason'] = df['rcmd_reason'].map(
        lambda x: x['content'] if isinstance(x, dict) and x['content'] != '' else NA)
    # Flatten the nested 'stat' counters (view/like/coin/favorite/reply/
    # danmaku/share) into top-level columns, dropping counters the charts
    # never read, then join them back on the video id.
    stat = df['stat']
    df.drop('stat', axis=1, inplace=True)
    stat_df = pd.json_normalize(stat)
    stat_df.drop(columns=['now_rank', 'his_rank', 'dislike', 'vt', 'vv'], inplace=True)
    df_stat = pd.merge(df, stat_df, how='left', on='aid')
    return df_stat


def _duration_str(seconds):
    """Format a duration in seconds as 'MM:SS', or 'HH:MM:SS' when >= 1 hour."""
    stamp = pd.to_datetime(seconds, unit='s')
    return stamp.strftime('%M:%S' if stamp.hour == 0 else '%H:%M:%S')


def time_format(x):
    """Format a duration given in seconds as 'MM:SS', or 'HH:MM:SS' when it is an hour or longer."""
    ts = pd.to_datetime(x, unit='s')
    fmt = '%M:%S' if ts.hour == 0 else '%H:%M:%S'
    return ts.strftime(fmt)


def requestAllToExcle():
    """Fetch every page of the popular list and save it to data.xlsx.

    Requests 50 records per page until the API's ``no_more`` flag reports
    that the list is exhausted, cleans each page with ``collation`` and
    writes the concatenated result to ``data.xlsx``.
    """
    resource = pd.DataFrame()
    page = 1
    while True:
        respData = requesData(50, page)
        # no_more is a JSON boolean; stop as soon as the API says so.
        if respData['data']['no_more']:
            print('pn =', page)
            break
        # Convert the page's record list to a DataFrame, clean it, and
        # append it to the accumulated result.
        df = collation(pd.DataFrame(respData['data']['list']))
        resource = pd.concat([resource, df])
        page += 1
    # These columns (activity/season/premiere metadata) only appear on some
    # videos, so strip whichever of them showed up once all pages are merged.
    optional_cols = ['redirect_url', 'season_id', 'up_from_v2', 'mission_id', 'premiere']
    resource.drop(columns=[col for col in optional_cols if col in resource.columns],
                  inplace=True)
    # Persist the cleaned data set.
    resource.to_excel(excel_writer='data.xlsx')


def config_plt():
    """Configure pyplot so that chart labels can render Chinese text."""
    # SimHei ships CJK glyphs; without it Chinese labels render as boxes.
    plt.rcParams.update({'font.family': ['SimHei']})


def drawBar_by_tname(data):
    """Draw a bar chart of how many trending videos each category (tname) has."""
    plt.subplots(figsize=(20, 5))
    data['tname'].value_counts().plot.bar()
    plt.show()


def drawBar_by_location(data):
    """Draw a bar chart of how many trending videos come from each region."""
    plt.subplots(figsize=(20, 5))
    data['pub_location'].value_counts().plot.bar()
    plt.show()


def drawWordCloud_by_title(data):
    """Render a word cloud built from all video titles."""
    # Join every title into one string, then segment it with jieba so the
    # cloud counts Chinese words rather than whole sentences.
    joined = data['title'].str.cat(sep=' ')
    segmented = ' '.join(jieba.cut(joined))
    # SimHei.ttf is required so Chinese words render in the image.
    cloud = WordCloud(width=800, height=800, background_color='white',
                      min_font_size=10, font_path='SimHei.ttf')
    cloud.generate(segmented)
    plt.figure(figsize=(8, 8))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()

def drawBar_by_view(data):
    """Draw a bar chart of the summed view count of trending videos per category.

    The previous implementation forced a y-tick every 1000 views via
    ``np.arange``; with view totals in the millions that materialized
    hundreds of thousands of tick labels and made rendering extremely slow
    (which is why this call was being timed in ``main``).  The axis locator,
    constrained to integers, now picks a sensible number of ticks itself.
    """
    fig, ax = plt.subplots(figsize=(20, 5))
    grouped = data.groupby('tname')['view'].sum()
    # Keep tick values integral (view counts), but let matplotlib choose
    # how many ticks to draw.
    ax.get_yaxis().get_major_locator().set_params(integer=True)
    grouped.plot.bar()
    plt.show()

def main():
    """Load the saved data set and render every chart."""
    # Uncomment to refresh data.xlsx with the latest popular list first.
    # requestAllToExcle()
    resource = pd.read_excel('data.xlsx')
    # Set up fonts so Chinese labels render correctly.
    config_plt()
    # Bar chart: trending-video count per category.
    drawBar_by_tname(resource)
    # Bar chart: trending-video count per region.
    drawBar_by_location(resource)
    # Word cloud built from video titles.
    drawWordCloud_by_title(resource)
    # Bar chart: total views per category (timed, since it used to be slow).
    started = datetime.now().timestamp()
    drawBar_by_view(resource)
    elapsed = datetime.now().timestamp() - started
    minutes, seconds = divmod(elapsed, 60)
    print(f"{int(minutes)}:{int(seconds):02d}")


# Script entry point: only run the pipeline when executed directly.
if __name__ == "__main__":
    main()

可以取消 requestAllToExcle 方法的注释,获取最新的数据保存至 data.xlsx 中。