Python多线程爬虫

169 阅读1分钟

Python多线程爬虫

需求教程(旧版):bilibili传送门
完整代码

import requests
from lxml import etree
import re
import os
from multiprocessing.dummy import Pool
# Save one video to disk
def get_video(video_all):
    """Download a single video and store it as ``pearvideo/<title>.mp4``.

    Args:
        video_all: dict with the keys ``'true_url'`` (URL the mp4 is fetched
            from) and ``'title'`` (used for the progress messages and for
            the file name).

    Returns:
        None. Progress and the outcome are reported with ``print``.

    The file is written only when the server answers with status 200.
    Request errors and non-200 answers are reported as a failed save instead
    of raising or leaving a bogus ``.mp4`` file behind.
    """
    title = video_all['title']
    print(title,'正在下载...')
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
    }
    try:
        # Without a timeout a stalled connection would block this worker forever.
        response = requests.get(video_all['true_url'],headers = headers,timeout = 30)
    except requests.RequestException as err:
        print(title,'视频保存失败',err)
        return
    code = response.status_code
    if code == 200:
        # Replace characters that cannot appear in a file name (path
        # separators everywhere, the rest on Windows) so that an unusual
        # title cannot make open() fail or escape the download directory.
        safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title)
        # os.path.join builds the path portably; the previous
        # '.\pearvideo\...' literal only named a file inside the directory
        # on Windows and depended on invalid escape sequences.
        path = os.path.join('pearvideo', safe_title + '.mp4')
        with open(path,'wb') as fp:
            fp.write(response.content)
        outcome = '视频保存成功'
    else:
        # Do not store an error page under an .mp4 name.
        outcome = '视频保存失败'
    print(title,'响应码:',code,outcome)
# Create the download directory.
# makedirs(exist_ok=True) leaves no gap between the existence check and the
# creation, so it cannot fail when the directory appears in between.
os.makedirs("./pearvideo", exist_ok=True)
# Fetch the category listing page and collect the link of every video on it
url = 'https://www.pearvideo.com/category_5'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'}
listing_page = requests.get(url, headers=headers)
listing_tree = etree.HTML(listing_page.text)
# href of every li/div/a below the element whose id is "listvideoListUl"
li_list = listing_tree.xpath('//*[@id="listvideoListUl"]/li/div/a/@href')
print(li_list)
# Build the list of videos (download URL + title) from the links of the main page
video_list = []
# Raw strings keep the regex escapes intact ('\d' inside a plain literal is an
# invalid escape sequence); the patterns are compiled once, outside the loop.
cont_id_re = re.compile(r'video_(\d+)')
# Leading "<digits>-" of the last path segment of the mp4 URL.
stamp_re = re.compile(r'/\d+-(?=[^/]*$)')
title_re = re.compile(r'(.*?)_')
for li in li_list:
    # The XHR request to videoStatus.jsp returns JSON that contains the mp4 url
    id_match = cont_id_re.search(li)
    if id_match is None:
        # One unexpected link must not abort the whole crawl.
        print(li,'跳过: 链接中没有视频id')
        continue
    v_li = id_match.group(1)
    v_url = 'https://www.pearvideo.com/videoStatus.jsp'
    v_headers = {
        'Referer':'https://www.pearvideo.com/'+li,
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
    }
    v_param = {
        'contId':v_li
    }
    v_response = requests.get(v_url,params = v_param,headers = v_headers)
    try:
        mp4_url = v_response.json()['videoInfo']['videos']['srcUrl']
    except (ValueError, KeyError, TypeError):
        # Body was not JSON, or the JSON carries no video entry.
        print(li,'跳过: 接口未返回视频地址')
        continue
    # The script downloads from a rewritten URL in which the leading number
    # of the file name is replaced by "cont-<contId>":
    #   mp4_url : https://video.pearvideo.com/mp4/adshort/20210201/1612171096323-15594744_adpkg-ad_hd.mp4
    #   true_url: https://video.pearvideo.com/mp4/adshort/20210201/cont-1718799-15594744_adpkg-ad_hd.mp4
    # Rewriting only that prefix keeps whatever directory and suffix the
    # server returned instead of assuming 'adshort' and '_adpkg-ad_hd.mp4'.
    true_url, replaced = stamp_re.subn('/cont-'+v_li+'-', mp4_url)
    if not replaced:
        print(li,'跳过: 无法识别的视频地址',mp4_url)
        continue
    print(true_url)
    # Fetch the title from the video page
    t_url = 'https://www.pearvideo.com/'+li
    t_text = requests.get(t_url,headers = headers).text
    t_tree = etree.HTML(t_text)
    page_titles = t_tree.xpath('//title/text()')
    if page_titles:
        # Keep the part in front of the first '_'; use the whole <title>
        # text when it contains no underscore.
        title_match = title_re.search(page_titles[0])
        title = title_match.group(1) if title_match else page_titles[0]
    else:
        # No <title> on the page: fall back to the video id as the name.
        title = v_li
    print(title)
    video_list.append({
        'true_url':true_url,
        'title':title
    })
print()
print(video_list)
print()
# Create the thread pool and run the download function over every video
pool = Pool(4)
try:
    pool.map(get_video,video_list)
finally:
    # Shut the pool down even when a download raised: close() stops new
    # work from being accepted and join() waits for the worker threads.
    pool.close()
    pool.join()