Python Scraping in Practice: Downloading HD Videos from Bilibili


Python scraping in practice: downloading HD, watermark-free videos from Bilibili.

Batch-download every video from a Bilibili uploader's space!

The crawler, step by step:

1.

Pick the crawl target: the uploader 伢伢gagako's space on Bilibili (伢伢gagako的个人空间_哔哩哔哩_bilibili). The number at the end of a space URL, e.g. https://space.bilibili.com/632887, is the uploader's mid, which the API calls below rely on.

(Screenshot 1: the uploader's space page)

Work out where the page keeps the data:

(Screenshot 2: locating the video data in the page source)
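You can verify this by hand before writing the full crawler: each video page embeds its play info as JSON in a window.__playinfo__ script tag. A minimal sketch (the BV id below is only a placeholder; substitute any real video of the target uploader):

import json
import re

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
    'Referer': 'https://www.bilibili.com/'
}

# BV1xx411c7mD is a placeholder id; replace it with a real video's BV id
html = requests.get('https://www.bilibili.com/video/BV1xx411c7mD', headers=headers)
raw = re.findall('<script>window.__playinfo__=(.*?)</script>', html.text)[0]
playinfo = json.loads(raw)  # parse the JSON safely instead of eval-ing it

# the picture and the sound come as separate streams, merged later with FFmpeg
print(playinfo['data']['dash']['video'][0]['backupUrl'][0])
print(playinfo['data']['dash']['audio'][0]['backupUrl'][0])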

2.

With the data located, here is the full script:

import json
import os
import re
import subprocess
import threading
from os import makedirs, path
from queue import Empty, Queue

import requests

q = Queue(22)  # bounded queue: the producer blocks once 22 items are waiting

data_temp = 'BiliBili_临时'  # temporary folder for the separate video and audio streams
save_path = 'BiliBili_视频'  # final folder for the merged videos

# create both folders; exist_ok spares the try/except on re-runs
makedirs(data_temp, exist_ok=True)
makedirs(save_path, exist_ok=True)

# on repeated runs, already-saved videos are skipped
file_data_list = os.listdir(save_path)  # filenames already in the save folder

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
    'Referer': 'https://www.bilibili.com/'
}


def main():
    for i in range(1, 5):  # first four pages of the uploader's video list
        # w_rid and wts form a signed token copied from a real browser request;
        # they expire, so take fresh values from the Network tab if the API refuses
        up_url = f'https://api.bilibili.com/x/space/wbi/arc/search?mid=632887&ps=30&tid=0&pn={i}&keyword=&order=pubdate&order_avoided=true&w_rid=793b5766254212805f363285ae6ef1c9&wts=1673839673'
        print(up_url)
        up_json_data = requests.get(up_url, headers=headers)
        v_list = up_json_data.json()['data']['list']['vlist']
        for j in v_list:
            title = j['title']
            bv_id = j['bvid']
            if f'{title}.mp4' in file_data_list:  # skip videos that are already saved
                continue
            # hand the title and its page url to the parser
            parser_html(title, f'https://www.bilibili.com/video/{bv_id}')

def parser_html(title, video_url):
    """Extract the audio and video stream urls from a video page."""
    html = requests.get(video_url, headers=headers)  # headers must be a keyword argument
    json_data = re.findall('<script>window.__playinfo__=(.*?)</script>', html.text)[0]
    dash = json.loads(json_data)['data']['dash']  # json.loads replaces the unsafe eval trick
    audio_url = dash['audio'][0]['backupUrl'][0]
    audio_place = path.join(data_temp, f'{title}.mp3')
    video_url = dash['video'][0]['backupUrl'][0]
    video_place = path.join(data_temp, f'{title}.mp4')

    q.put([title, audio_url, audio_place, video_url, video_place])
    print(f'Items waiting in the queue: {q.qsize()}')


def download_data():
    while True:
        print(f'Items left in the queue: {q.qsize()}')
        try:
            title, audio_url, audio_place, video_url, video_place = q.get(timeout=15)
        except Empty:  # nothing arrived within 15 s, so the producer is done
            print('All videos fetched')
            break
        # save the audio stream
        audio = requests.get(audio_url, headers=headers, stream=True)
        with open(audio_place, 'wb') as f_a:
            for chunk in audio.iter_content(chunk_size=1024):
                f_a.write(chunk)

        # save the video stream
        video = requests.get(video_url, headers=headers, stream=True)
        with open(video_place, 'wb') as f_v:
            for chunk in video.iter_content(chunk_size=1024):
                f_v.write(chunk)

        print(f'{title} finished downloading')
        merge_data(title, video_place, audio_place)


def merge_data(title, video_place, audio_place):
    out_place = path.join(save_path, f'{title}.mp4')
    # pass an argument list (no shell) so titles with spaces or quotes cannot break the command
    cmd = ['ffmpeg.exe', '-loglevel', 'quiet', '-i', video_place, '-i', audio_place,
           '-acodec', 'copy', '-vcodec', 'copy', out_place]
    subprocess.run(cmd)
    try:  # delete the separate streams once they are merged
        os.remove(video_place)
    except Exception as e1:
        print(e1)

    try:
        os.remove(audio_place)
    except Exception as e2:
        print(e2)


if __name__ == '__main__':
    t1 = threading.Thread(target=main)  # producer: collects the stream urls
    t1.start()

    list_t = []  # consumers: download the streams and merge them
    for i in range(10):
        t2 = threading.Thread(target=download_data)
        t2.start()
        list_t.append(t2)

    for t in list_t:  # wait for all worker threads to finish
        t.join()
    t1.join()

I added multithreading here for download speed. Bilibili serves the picture and the sound as two separate streams (the dash data above), so FFmpeg is needed to merge them into a single file; I will bundle FFmpeg with the program.
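One caveat the script does not cover: video titles can contain characters that Windows forbids in filenames (\ / : * ? " < > |), which would make the file writes or the FFmpeg step fail. A small helper like the sketch below, applied to title before any path is built, works around that; sanitize_title is my own hypothetical addition, not part of the original script.

import re

def sanitize_title(title: str) -> str:
    # swap the characters Windows forbids in filenames for underscores
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

print(sanitize_title('Vlog: day 1?'))  # prints Vlog_ day 1_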

3. Results of the crawl:

(Screenshot 3: the downloaded videos)

4.

Code and tools — Aliyun Drive: Bilibili video scraping tool www.aliyundrive.com/s/pDxLUJEfH… Baidu Netdisk: pan.baidu.com/s/1WDNxcwA4… (extraction code: t19b)

Finally, if you have any questions, feel free to message me privately. Thank you all.