Python Web Scraping in Practice: Grabbing Watermark-Free HD Videos from Bilibili
Batch-download every video from a Bilibili uploader's homepage!
Crawling, step one:
1. Identify the target: 伢伢gagako's personal space on Bilibili (伢伢gagako的个人空间_哔哩哔哩_bilibili).
Work out where the data sits: the video list comes back from the uploader-space API as JSON, under data.list.vlist (a quick probe is sketched below).
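If you want to confirm the data location yourself, you can hit the uploader-space API directly and print what comes back. A minimal sketch, reusing the same captured request URL as the full script below (the w_rid/wts pair is a signature copied from the browser's network panel and will eventually expire):

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
    'Referer': 'https://www.bilibili.com/'
}
probe_url = ('https://api.bilibili.com/x/space/wbi/arc/search'
             '?mid=632887&ps=30&tid=0&pn=1&keyword=&order=pubdate&order_avoided=true'
             '&w_rid=793b5766254212805f363285ae6ef1c9&wts=1673839673')
resp = requests.get(probe_url, headers=headers).json()
for v in resp['data']['list']['vlist']:  # the video list lives at data.list.vlist
    print(v['bvid'], v['title'])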
2. With the data location pinned down, straight to the code:
import json
import os
import re
import subprocess
import threading
from os import makedirs, path
from queue import Empty, Queue

import requests
q = Queue(22)  # hand-off queue between the producer and the download workers
data_temp = 'BiliBili_临时'  # temporary folder for the separate video/audio streams
save_path = 'BiliBili_视频'  # final save folder
makedirs(data_temp, exist_ok=True)  # create the temp folder if it does not exist
makedirs(save_path, exist_ok=True)  # create the save folder if it does not exist
# when the script is re-run, videos that were already saved are skipped
file_data_list = os.listdir(save_path)  # filenames already present in the save folder
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
'Referer': 'https://www.bilibili.com/'
}
def main():
for i in range(1, 5):
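        # w_rid/wts below are a captured WBI signature tied to a timestamp; if the
        # request starts failing, copy a fresh URL from the browser's network panel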
up_url = f'https://api.bilibili.com/x/space/wbi/arc/search?mid=632887&ps=30&tid=0&pn={i}&keyword=&order=pubdate&order_avoided=true&w_rid=793b5766254212805f363285ae6ef1c9&wts=1673839673'
# "https: // api.bilibili.com / x / polymer / web - space / home / seasons_series?mid = 3493109610580914 & page_num = 1 & page_size = 10”
# up_url = f'https://api.bilibili.com/x/space/arc/search?mid=487939159&pn={i}&ps=30&index='
# https: // api.bilibili.com / x / space / wbi / arc / search?mid = 21648772 & ps = 30 & tid = 0 & pn = 1 & keyword = & order = pubdate & order_avoided = true & w_rid = 180a63f98a4ae4c21b3e8bc0ea725dad & wts = 1673746061
print(up_url)
up_json_data = requests.get(up_url, headers=headers)
v_list = up_json_data.json()['data']['list']['vlist']
        for j in v_list:
            # strip characters that are illegal in Windows filenames
            title = re.sub(r'[\\/:*?"<>|]', '_', j['title'])
            bv_id = j['bvid']
            if f'{title}.mp4' in file_data_list:  # skip videos already saved on a previous run
                continue
            # hand the title and the video page URL to the parser
            parser_html(title, f'https://www.bilibili.com/video/{bv_id}')
def parser_html(title, video_url):
    """Pull the audio and video stream URLs out of a video page."""
    html = requests.get(video_url, headers=headers)
    json_str = re.findall('<script>window.__playinfo__=(.*?)</script>', html.text)[0]
    dash = json.loads(json_str)['data']['dash']  # json.loads replaces the null/false/true eval() hack
    audio_url = dash['audio'][0]['backupUrl'][0]
    audio_place = path.join(data_temp, f'{title}.mp3')
    video_url = dash['video'][0]['backupUrl'][0]
    video_place = path.join(data_temp, f'{title}.mp4')
    q.put([title, audio_url, audio_place, video_url, video_place])
    print(f'queue size after put: {q.qsize()}')
    # download_data(title, audio_url, audio_place, video_url, video_place)  # single-threaded alternative
def download_data():
    while True:
        print(f'queue size before get: {q.qsize()}')
        try:
            title, audio_url, audio_place, video_url, video_place = q.get(timeout=15)
        except Empty:
            print('all videos have been fetched')
            break
        # save the audio stream
        audio = requests.get(audio_url, headers=headers, stream=True)
        with open(audio_place, 'wb') as f_a:
            for chunk in audio.iter_content(chunk_size=1024):
                f_a.write(chunk)
        # save the video stream
        video = requests.get(video_url, headers=headers, stream=True)
        with open(video_place, 'wb') as f_v:
            for chunk in video.iter_content(chunk_size=1024):
                f_v.write(chunk)
        print(f'{title} downloaded')
        merge_data(title, video_place, audio_place)
def merge_data(title, video_place, audio_place):
    # quote every path so titles containing spaces do not break the command
    cmd = f'ffmpeg.exe -loglevel quiet -i "{video_place}" -i "{audio_place}" -acodec copy -vcodec copy "{save_path}\\{title}.mp4"'
    subprocess.run(cmd, shell=True)
    try:  # delete the raw streams once they are merged
        os.remove(video_place)
    except Exception as e1:
        print(e1)
    try:
        os.remove(audio_place)
    except Exception as e2:
        print(e2)
if __name__ == '__main__':
    t1 = threading.Thread(target=main)  # producer: collects the video links
    t1.start()
    list_t = []  # consumers: download and merge the queued streams
    for i in range(10):
        t2 = threading.Thread(target=download_data)
        t2.start()
        list_t.append(t2)
    for t in list_t:  # wait for every worker thread to finish
        t.join()
    t1.join()
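One detail behind the "HD" in the title: data.dash.video in __playinfo__ is a list of streams at several resolutions, and the script simply takes index 0, i.e. the best quality the session is offered. If you want to cap or choose the resolution explicitly, a sketch along these lines should work (pick_stream and max_height are my own hypothetical names; the backupUrl field is the one the script already uses, and height is as observed in the playinfo JSON):

def pick_stream(dash, max_height=1080):
    """Pick the highest-resolution stream no taller than max_height (hypothetical helper)."""
    candidates = [v for v in dash['video'] if v.get('height', 0) <= max_height] or dash['video']
    best = max(candidates, key=lambda v: v.get('height', 0))
    return best['backupUrl'][0]

You would then call pick_stream(dash) in parser_html instead of dash['video'][0]['backupUrl'][0].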
To speed up the crawl I added multithreaded downloading. Bilibili serves the picture and the sound as two separate DASH streams (hence the ['data']['dash'] lookup in the code), so FFmpeg is used to merge them back into a single MP4; I will bundle ffmpeg.exe with the program.
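Because the merge runs with -loglevel quiet and ffmpeg's exit status is never checked, a missing ffmpeg.exe fails silently and you simply end up with no merged files. A small pre-flight check is worth adding; a sketch (check_ffmpeg is my own helper name):

import shutil

def check_ffmpeg():
    # shutil.which searches PATH (and, on Windows, also the current directory)
    if shutil.which('ffmpeg') is None:
        raise SystemExit('ffmpeg not found: put ffmpeg.exe next to this script or add it to PATH')

Call check_ffmpeg() at the top of the __main__ block before starting any threads.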
3. Scraping results:
4. Code and tools: Aliyun Drive: Bilibili video scraping tool www.aliyundrive.com/s/pDxLUJEfH… Baidu Netdisk: link: pan.baidu.com/s/1WDNxcwA4… extraction code: t19b
If you have any questions, feel free to message me. Thanks, everyone.