【part01】Python Crawler: Scraping 好看视频 (Haokan Video)


The full scraper source code is below; to use it, switch in your own download folder and cookie. The script crawls videos from the category tabs on the 好看视频 (Haokan Video) homepage, names each file after its video title, and saves it into a per-category folder. The crawl is incremental: videos already on disk are not downloaded again.

Notes:

  1. Building GET request parameters with `''.format()`, combined with a `for` or `while` loop for incremental crawling (a minimal sketch appears after this list).
  2. File handling: create a folder per video category, read the names of already-downloaded videos, and skip a download when a duplicate is found. This relies on `os.mkdir`, `os.path.join`, `os.path.exists`, `os.listdir`, and related `os` functions (see the sketch after this list).
  3. Loop idioms: `for i, k in enumerate(some_list)` and `for t, k in zip(list_a, list_b)`; note that `zip` takes two or more iterables (see the sketch after this list).
  4. Multi-threading for downloads:
with ThreadPoolExecutor(50) as t:  # thread pool with up to 50 workers to download videos
    for item in videos:
        title = item[0]
        play_url = item[1]
        time.sleep(random.uniform(0, 0.5))  # small random delay between submissions
        t.submit(download, title=title, play_url=play_url, headers=headers, path=path)
  5. Points to improve: exception handling and a proxy pool. The skeleton below shows where a try/except would wrap the request logic (a proxy-pool sketch follows):

try:
    ...  # request / download logic goes here
except requests.RequestException as e:
    print(f'Request failed: {e}')
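A minimal sketch for note 1, using `''.format()` to build the feed URL inside a loop so that each pass carries a fresh `shuaxin_id`; the URL and parameter names are taken from the full script below:

import time

url_template = 'https://haokan.baidu.com/web/video/feed?tab={}&act=pcFeed&pd=pc&num={}&shuaxin_id={}'
for _ in range(3):  # each iteration requests a fresh batch of videos
    shuaxin_id = int(time.time()) * 10000
    url = url_template.format('yingshi_new', 50, shuaxin_id)
    print(url)  # pass this to requests.get() in the real crawler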
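For note 2, a sketch of the os calls involved, assuming the base folder E:/好看视频 already exists (os.makedirs(path, exist_ok=True) would create the whole chain):

import os

path = os.path.join('E:/好看视频', 'yingshi')  # one folder per category
if not os.path.exists(path):
    os.mkdir(path)
downloaded = os.listdir(path)  # names of videos already on disk
if 'some_title.mp4' in downloaded:  # hypothetical file name
    print('Duplicate found, skipping download')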
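Note 3 in code form, since the full script itself does not use these idioms: enumerate() pairs each element with its index, and zip() walks two (or more) iterables in parallel. The sample lists here are made up:

titles = ['video_a', 'video_b']
urls = ['http://example.com/a.mp4', 'http://example.com/b.mp4']

for i, k in enumerate(titles):  # i is the index, k the element
    print(i, k)

for t, k in zip(titles, urls):  # t and k advance through both lists together
    print(t, k)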
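And for the proxy-pool part of note 5, a hedged sketch: requests accepts a proxies mapping, and the simplest "pool" is random.choice over a list. The addresses below are placeholders, not working proxies.

import random
import requests

proxy_pool = ['http://127.0.0.1:8001', 'http://127.0.0.1:8002']  # placeholder proxies

def get_with_proxy(url, headers):
    proxy = random.choice(proxy_pool)
    try:
        return requests.get(url, headers=headers,
                            proxies={'http': proxy, 'https': proxy}, timeout=5)
    except requests.RequestException as e:
        print(f'Proxy {proxy} failed: {e}')
        return None

The full script follows.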
import requests
import re
import time
import os
from concurrent.futures import ThreadPoolExecutor
import random

'''
/:*?"<>| cannot appear in file names
'''


def file_path(path):  # create the folder if it does not exist
    if not os.path.exists(path):
        os.mkdir(path)


def get_url(url, headers, params):  # fetch the feed endpoint; return video titles & download URLs
    time.sleep(random.uniform(1, 2))
    video_list = []
    res = requests.get(url=url, headers=headers, params=params, verify=False, timeout=5)  # verify=False skips TLS verification
    res.encoding = 'utf-8'
    videos_data = res.json()['data']['response']['videos']
    for data in videos_data:
        title = data['title']
        detail_url = data['play_url']
        video_list.append([title, detail_url])
    return video_list



def download(title, play_url, headers, path):  # download a single video
    time.sleep(random.uniform(1, 5))
    name = re.sub(r'[/\:*?"<>|\n\t]', "", title) + '.mp4'  # strip characters that are invalid in file names
    if name in os.listdir(path):  # incremental crawl: skip videos already on disk
        print(f'Already exists: {title}')
    else:
        print(f'Downloading ==> {title}')
        video_content = requests.get(play_url, headers=headers).content
        with open(os.path.join(path, name), 'wb') as fp:
            fp.write(video_content)
        print(f'Finished ==> {title}')



def main():
    while 1:  # loop forever; each pass pulls a fresh batch from every category
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
            'cookie': 'cookie',  # replace with your own cookie
            'referer': 'https://haokan.baidu.com/tab/yingshi_new?sfrom=recommend'
            }

        class_items = ['yingshi', 'yinyue', 'youxi', 'gaoxiao', 'zongyi', 'yule', 'dongman', 'shenghuo', 'guangchangwu',
                       'meishi', 'chongwu', 'sannong', 'junshi', 'shehui', 'tiyu', 'shishang',
                       'qiche', 'qinzi', 'wenhua', 'lvyou', 'miaodong']  # homepage category slugs


        for i_tab in class_items:
            path = os.path.join('E:/好看视频', i_tab)
            file_path(path)
            tab = i_tab + '_new'  # the feed endpoint expects the '_new' suffix on category names


            time_now = int(time.time()) * 10000  # builds the shuaxin_id parameter
            params = {
                'tab': tab,
                'act': 'pcFeed',
                'pd': 'pc',
                'num': '50',
                'shuaxin_id': time_now,
            }  # request parameters for the feed endpoint
            main_url = 'https://haokan.baidu.com/web/video/feed'
            videos = get_url(main_url, headers=headers, params=params)
            print(f'Fetching category: {i_tab}')

            # record the thread-pool start time
            t1 = time.time()
            with ThreadPoolExecutor(50) as t:  # thread pool with up to 50 workers to download videos
                for item in videos:
                    title = item[0]
                    play_url = item[1]
                    time.sleep(random.uniform(0, 0.5))
                    t.submit(download, title=title, play_url=play_url, headers=headers, path=path)
            t2 = time.time()
            print(f'Elapsed: {t2 - t1} seconds; the folder now holds {len(os.listdir(path))} videos', end="\n\n\n")


if __name__ == "__main__":
    main()
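One improvement worth noting beyond the list above: download() reads each whole video into memory via .content before writing it out, which is costly for large files. Below is a sketch of a streaming variant using requests' documented stream=True and iter_content() API; the chunk size is an arbitrary choice, and it reuses the imports from the script above:

def download_streamed(title, play_url, headers, path):
    name = re.sub(r'[/\:*?"<>|\n\t]', "", title) + '.mp4'
    target = os.path.join(path, name)
    if os.path.exists(target):  # keep the incremental behaviour: skip existing files
        return
    with requests.get(play_url, headers=headers, stream=True, timeout=10) as res:
        res.raise_for_status()
        with open(target, 'wb') as fp:
            for chunk in res.iter_content(chunk_size=64 * 1024):  # write 64 KB at a time
                fp.write(chunk)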