Python爬虫爬取梨视频,下载视频到本地

498 阅读2分钟

我是在跟B站Up主‘路飞学城IT’学习的过程中完成的这个案例。

怎么说呢,感触最深的就是,爬虫这东西,学得越晚越难啊,前面前辈把网站都爬了好多遍了,那些网站都升级了反爬机制,导致我们后来的都爬不动了,ヾ(◍°∇°◍)ノ゙。虽然有反爬机制就有反反爬策略,但是对于我们这些萌新来说真的好难啊,┭┮﹏┭┮ !!!总归在我的不懈努力下,自己独立(差不多算独立)完成了这个案例,但是还是参考了前辈的博客。

梨视频官网:梨视频官网-有故事的短视频-Pear Video

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @FileName  :03线程池爬取梨视频.py
# @Time      :2023/8/9 15:52
# @Author    :Yuan
import time
from multiprocessing.dummy import Pool
import requests
from lxml import etree
import random
import os

# Local directory where downloaded videos are saved.
path = './梨视频/'
# makedirs with exist_ok=True avoids the check-then-create race of
# `if not os.path.exists(...): os.mkdir(...)` and is a no-op when the
# directory already exists.
os.makedirs(path, exist_ok=True)
# Browser-like User-Agent so the site does not reject the requests.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188'
}


def download_video(div):
    """Download one video and write it into ``path``.

    div: dict with keys 'url' (direct mp4 download URL) and
         'name' (target file name, including the '.mp4' suffix).
    """
    # stream=True keeps large videos out of memory; the timeout prevents
    # a dead connection from hanging this worker thread forever.
    response = requests.get(url=div['url'], headers=headers,
                            stream=True, timeout=30)
    # Fail loudly instead of silently saving an HTML error page as .mp4.
    response.raise_for_status()
    file_name = path + div['name']
    with open(file_name, 'wb') as fp:
        # Copy the body in 64 KiB chunks.
        for chunk in response.iter_content(chunk_size=64 * 1024):
            fp.write(chunk)
    print(file_name + '下载完成')

if __name__ == "__main__":
    start_time = time.time()
    url = 'https://www.pearvideo.com/category_1'
    page_text = requests.get(url=url, headers=headers).text
    et = etree.HTML(page_text)
    li_list = et.xpath('//*[@id="listvideoListUl"]/li[@class="categoryem "]')
    # print(li_list)
    # 存取视频的链接,一块下载
    video_list = []
    for li in li_list:
        # 获取视频的链接
        video_text = li.xpath('.//a/@href')[0]
        # 在视频的链接中裁取视频代码
        contId = video_text.split('_')[1]
        # print(contId)
        # 视频播放地址
        src = 'https://www.pearvideo.com/' + video_text
        # print(src)
        # 获取视频名字
        name_page = requests.get(url=src, headers=headers).text
        et2 = etree.HTML(name_page)
        video_name = et2.xpath('//h1[@class="video-tt"]/text()')[0] + '.mp4'
        # print(video_name)
        # 在请求时,header中加一个Referer,表示我从哪个视频过来的,否则会提示视频下架
        headers['Referer'] = src
        # ajax 获取视频地址字典
        ajax_url = 'https://www.pearvideo.com/videoStatus.jsp'
        params = {
            'contId': contId,
            # 随机生成一个数传入就行
            'mrd': random.random()
        }
        # 通过ajax获取包含视频下载地址的json
        video_json = requests.get(url=ajax_url, params=params, headers=headers).json()
        # print(video_json)
        # 视频假的地址
        video_fake_url = video_json['videoInfo']['videos']['srcUrl']
        # print(video_fake_url)
        # 真地址:https://video.pearvideo.com/mp4/adshort/20190506/cont-1551306-13881116_adpkg-ad_hd.mp4
        # 假地址:https://video.pearvideo.com/mp4/adshort/20190506/1691569035447-13881116_adpkg-ad_hd.mp4
        # 拼接真的地址
        # https://video.pearvideo.com/mp4/adshort/20190506/1691569035447    13881116_adpkg  ad_hd.mp4
        str_list = video_fake_url.split('-')
        # 查找假的代码 然后删除
        fake_code = str_list[0].split('/')[-1]
        # 真实地址的第一部分
        real_url_0 = str_list[0].replace(fake_code, '')
        # 真实的视频下载地址
        video_real_url = real_url_0 + 'cont-' + contId + '-' + str_list[1] + '-' + str_list[2]
        # print(video_real_url)
        video_dic = {
            'url': video_real_url,
            'name': video_name,
        }
        video_list.append(video_dic)
    # 创建线程池下载视频
    pool = Pool(3)
    pool.map(download_video, video_list)  # 用时14s
    # for div in video_list: # 用时15s
    #     download_video(div)
    end_time = time.time()
    print('全部完成,共耗时:%ds' % (end_time - start_time))