我是在跟B站Up主‘路飞学城IT’学习的过程中完成的这个案例。
怎么说呢,感触最深的就是,爬虫这东西,学得越晚越难啊,前面前辈把网站都爬了好多遍了,那些网站都升级了反爬机制,导致我们后来的都爬不动了,ヾ(◍°∇°◍)ノ゙。虽然有反爬机制就有反反爬策略,但是对于我们这些萌新来说真的好难啊,┭┮﹏┭┮ !!!总归在我的不懈努力下,自己独立(差不多算独立)完成了这个案例,但是还是参考了前辈的博客。
梨视频官网:梨视频官网-有故事的短视频-Pear Video
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @FileName :03线程池爬取梨视频.py
# @Time :2023/8/9 15:52
# @Author :Yuan
import time
from multiprocessing.dummy import Pool
import requests
from lxml import etree
import random
import os
# Directory all downloaded videos are written into.
path = './梨视频/'
# makedirs(exist_ok=True) is atomic: it avoids the check-then-create race of
# `if not exists: mkdir` and is a no-op when the directory already exists.
os.makedirs(path, exist_ok=True)
# Browser-like User-Agent so the site does not reject the requests outright.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188'
}
def download_video(div):
    """Download one video and save it under `path`.

    Parameters
    ----------
    div : dict
        Mapping with keys 'url' (direct .mp4 download address) and
        'name' (target file name, already ending in '.mp4').

    Raises
    ------
    requests.RequestException
        On network failure/timeout or a non-2xx HTTP status.
    """
    # timeout prevents a stalled connection from hanging this worker thread forever
    resp = requests.get(url=div['url'], headers=headers, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently writing an error page as an .mp4
    resp.raise_for_status()
    fileName = path + div['name']
    with open(fileName, 'wb') as fp:
        fp.write(resp.content)
    print(fileName + '下载完成')
if __name__ == "__main__":
start_time = time.time()
url = 'https://www.pearvideo.com/category_1'
page_text = requests.get(url=url, headers=headers).text
et = etree.HTML(page_text)
li_list = et.xpath('//*[@id="listvideoListUl"]/li[@class="categoryem "]')
# print(li_list)
# 存取视频的链接,一块下载
video_list = []
for li in li_list:
# 获取视频的链接
video_text = li.xpath('.//a/@href')[0]
# 在视频的链接中裁取视频代码
contId = video_text.split('_')[1]
# print(contId)
# 视频播放地址
src = 'https://www.pearvideo.com/' + video_text
# print(src)
# 获取视频名字
name_page = requests.get(url=src, headers=headers).text
et2 = etree.HTML(name_page)
video_name = et2.xpath('//h1[@class="video-tt"]/text()')[0] + '.mp4'
# print(video_name)
# 在请求时,header中加一个Referer,表示我从哪个视频过来的,否则会提示视频下架
headers['Referer'] = src
# ajax 获取视频地址字典
ajax_url = 'https://www.pearvideo.com/videoStatus.jsp'
params = {
'contId': contId,
# 随机生成一个数传入就行
'mrd': random.random()
}
# 通过ajax获取包含视频下载地址的json
video_json = requests.get(url=ajax_url, params=params, headers=headers).json()
# print(video_json)
# 视频假的地址
video_fake_url = video_json['videoInfo']['videos']['srcUrl']
# print(video_fake_url)
# 真地址:https://video.pearvideo.com/mp4/adshort/20190506/cont-1551306-13881116_adpkg-ad_hd.mp4
# 假地址:https://video.pearvideo.com/mp4/adshort/20190506/1691569035447-13881116_adpkg-ad_hd.mp4
# 拼接真的地址
# https://video.pearvideo.com/mp4/adshort/20190506/1691569035447 13881116_adpkg ad_hd.mp4
str_list = video_fake_url.split('-')
# 查找假的代码 然后删除
fake_code = str_list[0].split('/')[-1]
# 真实地址的第一部分
real_url_0 = str_list[0].replace(fake_code, '')
# 真实的视频下载地址
video_real_url = real_url_0 + 'cont-' + contId + '-' + str_list[1] + '-' + str_list[2]
# print(video_real_url)
video_dic = {
'url': video_real_url,
'name': video_name,
}
video_list.append(video_dic)
# 创建线程池下载视频
pool = Pool(3)
pool.map(download_video, video_list) # 用时14s
# for div in video_list: # 用时15s
# download_video(div)
end_time = time.time()
print('全部完成,共耗时:%ds' % (end_time - start_time))