# Import the required modules.
import os.path
from DrissionPage import ChromiumPage
from DrissionPage.easy_set import set_paths
# Install the third-party packages imported further down (loguru, lxml,
# pandas) by shelling out to `pip`; as a module-level statement this runs on
# every start of the script.
# NOTE(review): `pip` is resolved through PATH, so it may not belong to the
# interpreter that is running this file -- confirm the target environment.
os.system("pip install loguru lxml pandas")
import time
import pandas as pd
from loguru import logger
import requests
from lxml import etree
# Download a video.
def download_video(url_list, save_name):
    """Download the first URL in ``url_list`` and write it to ``save_name``.

    Args:
        url_list: Candidate video source URLs; only the first entry is used.
            An empty list or ``None`` makes the call a no-op.
        save_name: Path of the file the video bytes are written to.

    Returns:
        None. Nothing is written when the server answers with an error
        status.
    """
    if not url_list:
        return

    video_url = url_list[0]
    # Only a protocol-relative URL ("//host/path") is missing its scheme.
    # A blanket replace("//", "http://") would also rewrite the "//" inside
    # an absolute URL ("https://host" -> "https:http://host") and any "//"
    # that happens to occur in the path.
    if video_url.startswith("//"):
        video_url = "http:" + video_url

    logger.info({"Downloading video": save_name})
    req = requests.get(video_url)
    if not req.ok:
        # Never store an error page under the .mp4 name: an existing file at
        # save_name would look like a finished download on the next run.
        logger.error({"download failed": video_url, "status": req.status_code})
        return

    with open(save_name, "wb") as f:
        f.write(req.content)
# Resolve the download link of a video page and download the video.
def get_download_link(url):
    """Open a video page, extract its video source URL and download it.

    The page is skipped when a file for it already exists in
    ``save_download_video_path``.

    Args:
        url: Link to the video page. May be absolute, protocol-relative
            ("//host/path") or relative to the site root.

    Returns:
        None.
    """
    # Local import: urljoin is standard library but not imported at file level.
    from urllib.parse import urljoin

    # urljoin resolves relative paths against the site root, gives
    # protocol-relative links the base scheme and leaves absolute URLs
    # untouched. Prepending the site root by string concatenation would turn
    # "//host/path" into "https://www.xxx.comhttp://host/path".
    url = urljoin("https://www.xxx.com", url)
    logger.info({"get url": url})

    # The file name is the first 15 characters of the last path segment;
    # a trailing "/" is dropped first so the segment is not empty.
    file_name = url.rstrip("/").split("/")[-1][:15]
    save_name = os.path.join(save_download_video_path, "{}.mp4".format(file_name))

    if os.path.exists(save_name):
        logger.info({"already save": save_name})
        return

    page.get(url)
    time.sleep(1)
    logger.info({"run": url})
    logger.info({"wait for loading": "10s"})
    # Fixed pause before reading the HTML so the <video> element can appear.
    time.sleep(10)

    root = etree.HTML(page.html)
    res = root.xpath("//video//source/@src")
    logger.info({"start downloading url": res})
    # download_video ignores an empty result list.
    download_video(res, save_name)
# Open the page in the browser, wait 3 seconds, then scroll down n times (n = scroll_times).
def create_selenium_driver(url):
    """Open a page, scroll it, and download every video link found on it.

    Despite the name, no browser is created here; the module-level ``page``
    object is reused. After loading ``url`` the page is scrolled down
    ``scroll_times`` times by 200 pixels, then every ``<a href>`` whose value
    contains "video" is passed to ``get_download_link``.

    Args:
        url: Address of the page to scan for video links.

    Returns:
        None.
    """
    page.get(url)
    logger.info({"wait for loading": "3s", "msg": "等待加载网页信息"})
    time.sleep(3)

    # Scroll step by step with a pause after each step (presumably so that
    # lazily loaded entries have time to appear -- confirm against the site).
    for _ in range(scroll_times):
        page.scroll.down(200)
        time.sleep(1)

    root = etree.HTML(page.html)
    hrefs = root.xpath("//a/@href")

    # dict.fromkeys drops repeated hrefs while keeping document order, so a
    # video linked several times on the page is only processed once.
    for href in dict.fromkeys(hrefs):
        if "video" in href:
            logger.info({"get a video url": href})
            get_download_link(href)
# Other basic parameters.
# Browser set-up: point DrissionPage at the Edge executable (Windows path)
# and open the site's start page once.
driver_path = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"
set_paths(browser_path=driver_path)
page = ChromiumPage()
page.get("https://www.xxx.com/")

# Directory the downloaded videos are written to.
save_download_video_path = "./video"
os.makedirs(save_download_video_path, exist_ok=True)

try:
    # Number of 200-pixel scroll steps used by create_selenium_driver.
    try:
        scroll_times = int(input("往下拉多少次,每次会自动拉200像素 输入数字即可 例如:10>>>"))
    except ValueError as e:
        # Fall back to a default and keep going. The main loop must not live
        # in the try's `else` branch, otherwise this default is never used.
        logger.error({"输入有误":str(e)})
        scroll_times = 10

    # Main loop: "q" quits, a URL containing "video" is treated as a single
    # video page, anything else as a listing page to scan for video links.
    while True:
        res = input("please input a url").strip()
        if res == "q":
            break
        if not res:
            # Ignore blank input instead of navigating to an empty URL.
            continue
        if "video" in res:
            logger.info({"get a video url": res})
            get_download_link(res)
        else:
            create_selenium_driver(res)
finally:
    # Close the browser even when a download or page load raises.
    page.quit()