Python scraping in practice: downloading the beauty wallpapers from 优美图库 (umei.cc)
I recently found a site full of nice photo galleries and wanted to download all of them, but saving each one by hand was far too tedious. Wouldn't it be nice to grab everything with a single click? So I reached for the obvious tool: a scraper!
Site link: www.umei.cc/bizhitupian…
The code is as follows:
import re
import requests
import os
import threading
from queue import Queue, Empty
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
q = Queue(30)                  # bounded queue of (detail-page URL, title) pairs
pool = ThreadPoolExecutor(10)  # thread pool for fetching the list pages
# Request headers copied from the browser's DevTools. The HTTP/2 pseudo-header
# entries (authority/method/path/scheme) and accept-encoding don't belong in a
# requests header dict, so they are dropped; the User-Agent and Referer are kept.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cache-control': 'no-cache',
    'pragma': 'no-cache',
    'referer': 'https://www.umei.cc/',
    'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Microsoft Edge";v="108"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.76',
}
def utl_image(url):
    """Producer: parse one list page and queue every gallery link it finds."""
    html = requests.get(url=url, headers=headers)
    html.encoding = "utf-8"
    my_html = html.text
    # Each thumbnail links to a gallery detail page; pull the href and the alt text (title).
    image = re.findall('<a href="(.*?)"><img class="lazy" data-original=".*?".*?alt=".*?" /></a>', my_html)
    title = re.findall('<a href=".*?"><img class="lazy" data-original=".*?".*?alt="(.*?)" /></a>', my_html)
    for url_image, title in zip(image, title):
        q.put([url_image, title])
def image_save():
    """Consumer: take a gallery off the queue, fetch its page and save the big picture."""
    while True:
        try:
            url_image, title = q.get(timeout=5)  # exit once the producers are done and the queue drains
        except Empty:
            break
        detail_url = 'https://www.umei.cc' + url_image
        html_url = requests.get(url=detail_url, headers=headers)
        html_url.encoding = 'utf-8'
        photo = re.findall('<div class="big-pic"><a href=".*?"><img alt=".*?" src="(.*?)".*?</div>', html_url.text)
        for i, photo_url in enumerate(tqdm(photo, desc=title)):
            resp = requests.get(url=photo_url, headers=headers)  # download the image itself
            # Number the files so pictures from the same gallery don't overwrite each other.
            with open(f"image/{title}_{i}.jpg", 'wb') as f:
                f.write(resp.content)
if __name__ == '__main__':
    os.makedirs('image', exist_ok=True)  # make sure the output folder exists
    url = ["https://www.umei.cc/bizhitupian/meinvbizhi/"]
    # Page 1 has no suffix; pages 2-9 follow the index_{i}.htm pattern.
    url_list = url + [f'https://www.umei.cc/bizhitupian/meinvbizhi/index_{i}.htm' for i in range(2, 10)]
    for url in url_list:
        pool.submit(utl_image, url)
    for i in range(5):
        t = threading.Thread(target=image_save)
        t.start()
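One caveat before running it: the gallery title is lifted straight out of the page's HTML, and characters such as \ / : * ? " < > | are illegal in Windows filenames, so open() can fail on some galleries. Below is a minimal sketch of a sanitizer; the safe_filename helper is my own addition, not part of the original script:

import re

def safe_filename(title: str, max_len: int = 100) -> str:
    """Replace characters that are illegal in Windows filenames and cap the length."""
    cleaned = re.sub(r'[\\/:*?"<>|]', '_', title).strip()
    return cleaned[:max_len] or 'untitled'

# usage inside image_save():
#     with open(f"image/{safe_filename(title)}_{i}.jpg", 'wb') as f:
#         f.write(resp.content)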
Run result:
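A possible improvement: the worker threads above exit by letting q.get(timeout=5) time out, which is fragile if a list page takes longer than five seconds to fetch. A cleaner pattern, sketched below against the same q queue from the script (this is not what the script above does), is to wait for all producers to finish and then push one None sentinel per worker:

def image_save():
    while True:
        item = q.get()
        if item is None:       # sentinel: no more work is coming
            break
        url_image, title = item
        ...                    # fetch the detail page and save, as before

# after submitting all utl_image jobs:
#     pool.shutdown(wait=True)   # wait for every producer to finish
#     for _ in range(5):         # one sentinel per consumer thread
#         q.put(None)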