爬取数据源代码如下。如需使用,请切换为自己的文件夹以及 cookie。代码作用:根据好看视频首页分类爬取视频,以视频标题命名,按视频分类保存;增量爬虫,已下载的重复视频不会再次下载。
笔记:
- get请求参数构建,使用''.format(),配合for 循环或者while 实现增量爬取
- 文件处理,根据视频分类创建文件夹,读取已下载的视频名称,爬取时判断如果重复则暂停下载,需要用到os.mkdir,os.path.join,os.path.exists,os.listdir等os相关函数
- 循环函数使用,for i,k in enumerate(list),for t,k in zip(list1,list2) 的用法
- 多线程的使用,
with ThreadPoolExecutor(50) as t: # 创建50个线程,对视频进行下载
for item in videos:
title = item[0]
play_url = item[1]
time.sleep(random.uniform(0, 0.5))
t.submit(download, title=title,play_url=play_url,headers=headers,path=path)
- 待加强的点:异常分析(用 try/except 捕获具体异常,而非裸 except)、代理池使用
import requests
import re
import time
import os
from concurrent.futures import ThreadPoolExecutor
import random
'''
/:*?"<>| 不能作为文件名
'''
def file_path(path):
    """Create the directory *path* if it does not already exist.

    Uses os.makedirs with exist_ok=True instead of the original
    exists-check + os.mkdir pair: it creates missing parent directories
    too, and avoids the race between the check and the creation.
    """
    os.makedirs(path, exist_ok=True)
def get_url(url, headers, params):
    """Fetch one feed page and return its videos as [title, play_url] pairs.

    Args:
        url: feed endpoint URL.
        headers: request headers (user-agent / cookie / referer).
        params: query-string parameters (tab, act, pd, num, shuaxin_id).

    Returns:
        list of two-element lists: [[title, play_url], ...].

    Raises:
        requests.RequestException on network failure; KeyError/ValueError
        if the JSON payload does not have the expected shape.
    """
    time.sleep(random.uniform(1, 2))  # polite random delay between page fetches
    # SECURITY NOTE(review): verify=False disables TLS certificate checks;
    # keep only if the endpoint's certificate is genuinely unverifiable.
    res = requests.get(url=url, headers=headers, params=params,
                       verify=False, timeout=5)
    res.encoding = 'utf-8'
    videos_data = res.json()['data']['response']['videos']
    # One [title, play_url] pair per feed entry.
    return [[data['title'], data['play_url']] for data in videos_data]
def download(title, play_url, headers, path):
    """Download one video into *path*, skipping files already on disk.

    Args:
        title: video title; sanitized to become the .mp4 filename.
        play_url: direct URL of the video stream.
        headers: request headers forwarded to the download request.
        path: directory the file is written into (must already exist).
    """
    time.sleep(random.uniform(1, 5))  # stagger worker threads' requests
    # Strip every character Windows forbids in filenames (the original
    # pattern's `\:` only matched ':' — backslash is now stripped too),
    # plus newlines/tabs that sometimes appear in titles.
    name = re.sub(r'[\\/:*?"<>|\n\t]', "", title) + '.mp4'
    target = os.path.join(path, name)
    # Incremental-crawl guard: os.path.exists is cheaper than scanning
    # the whole directory with os.listdir on every call.
    if os.path.exists(target):
        print('已存在')
    else:
        print(f'正在下载==>:{title}')
        # stream=True + chunked writes avoid buffering the whole video in
        # memory; timeout added so a dead connection cannot hang a thread.
        res = requests.get(play_url, headers=headers, stream=True, timeout=30)
        with open(target, 'wb') as fp:
            for chunk in res.iter_content(chunk_size=1 << 20):
                fp.write(chunk)
        print(f'下载完成==>:{title}')
def main():
    """Crawl every home-page category in an endless loop.

    Each pass: ensure the per-category folder exists, fetch the current
    feed page, then fan the downloads out over a 50-thread pool. Already
    downloaded videos are skipped inside download(), making the crawl
    incremental.
    """
    # Headers are identical for every request, so build them once
    # (the original rebuilt this dict on every while-loop pass).
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
        'cookie': 'cookie',
        'referer': 'https://haokan.baidu.com/tab/yingshi_new?sfrom=recommend'
    }
    # Home-page category slugs ('junshi' was listed twice in the
    # original; the duplicate caused a redundant crawl pass).
    class_items = ['yingshi', 'yinyue', 'youxi', 'gaoxiao', 'zongyi', 'yule', 'dongman', 'shenghuo', 'guangchangwu',
                   'meishi', 'chongwu', 'sannong', 'junshi', 'shehui', 'tiyu', 'shishang',
                   'qiche', 'qinzi', 'wenhua', 'lvyou', 'miaodong']
    while True:
        for i_tab in class_items:
            path = f"E:/好看视频/{i_tab}"  # per-category download folder
            file_path(path)
            # The feed endpoint expects the slug suffixed with '_new'.
            tab = i_tab + '_new'
            # Millisecond-scale timestamp used as the feed's refresh id.
            shuaxin_id = int(time.time()) * 10000
            parm = {
                'tab': tab,
                'act': 'pcFeed',
                'pd': 'pc',
                'num': '50',
                'shuaxin_id': shuaxin_id,
            }  # request parameters for the feed endpoint
            main_url = 'https://haokan.baidu.com/web/video/feed'
            videos = get_url(main_url, headers=headers, params=parm)
            print(f'正在访问{i_tab}')
            t1 = time.time()  # wall-clock start for this category's batch
            with ThreadPoolExecutor(50) as pool:  # 50 worker threads per batch
                for title, play_url in videos:
                    # Small jitter so submissions don't hit the server in a burst.
                    time.sleep(random.uniform(0, 0.5))
                    pool.submit(download, title=title, play_url=play_url,
                                headers=headers, path=path)
            t2 = time.time()
            print(f'耗时:{t2 - t1}秒,一共有{len(os.listdir(path))}个视频', end="\n\n\n")
# Run the crawler only when executed as a script (not on import).
if __name__ == "__main__":
    main()