Python crawler in practice: scraping the beauty wallpapers from the Umei gallery (优美图库)



I recently came across a website full of nice photo material. Downloading the pictures one by one would be far too tedious, so I wondered: is there a way to grab everything with a single click? That is when automation came to mind: a crawler!

Website: www.umei.cc/bizhitupian…
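Before writing the full crawler, it is worth a quick sanity check that the listing page really contains the lazy-loaded thumbnails that the regular expressions below rely on. A minimal sketch (it reuses the same regex as the script further down, and only a basic user-agent header, which is usually enough for a first test):

import re
import requests

# fetch the first listing page and count the detail-page links the regex finds
resp = requests.get('https://www.umei.cc/bizhitupian/meinvbizhi/',
                    headers={'user-agent': 'Mozilla/5.0'})
resp.encoding = 'utf-8'
links = re.findall('<a href="(.*?)"><img class="lazy" data-original=".*?".*?alt=".*?" /></a>', resp.text)
print(f'found {len(links)} detail-page links')  # zero means the page layout has changed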


The code is as follows:

import os
import re
import threading
from concurrent.futures import ThreadPoolExecutor
from queue import Queue, Empty

import requests
from tqdm import tqdm

q = Queue(30)                  # bounded queue of (detail-page link, title) pairs
pool = ThreadPoolExecutor(10)  # thread pool that parses the listing pages

headers = {'authority': 'www.umei.cc', 'method': 'GET', 'path': '/bizhitupian/meinvbizhi/', 'scheme': 'https',
           'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
           'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
           'cache-control': 'no-cache', 'pragma': 'no-cache', 'referer': 'https://www.umei.cc/',
           'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Microsoft Edge";v="108"', 'sec-ch-ua-mobile': '?0',
           'sec-ch-ua-platform': '"Windows"', 'sec-fetch-dest': 'document', 'sec-fetch-mode': 'navigate',
           'sec-fetch-site': 'same-origin', 'sec-fetch-user': '?1', 'upgrade-insecure-requests': '1',
           'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.76'
           }


def get_image_links(url):
    """Parse one listing page and queue up (detail-page link, title) pairs."""
    html = requests.get(url=url, headers=headers)
    html.encoding = "utf-8"
    my_html = html.text
    images = re.findall('<a href="(.*?)"><img class="lazy" data-original=".*?".*?alt=".*?" /></a>', my_html)
    titles = re.findall('<a href=".*?"><img class="lazy" data-original=".*?".*?alt="(.*?)" /></a>', my_html)
    for url_image, title in zip(images, titles):
        q.put([url_image, title])


def image_save():
    """Consumer thread: take detail pages off the queue and download their images."""
    while True:
        try:
            url_image, title = q.get(timeout=5)  # stop once the queue has been empty for 5 s
        except Empty:
            break
        detail_url = 'https://www.umei.cc' + url_image
        html_url = requests.get(url=detail_url, headers=headers)
        html_url.encoding = 'utf-8'
        photo = re.findall('<div class="big-pic"><a href=".*?"><img alt=".*?" src="(.*?)".*?</div>', html_url.text)
        for i, src in enumerate(tqdm(photo, desc=title)):
            resp = requests.get(url=src, headers=headers)  # download the image bytes
            # number the files so several images under the same title don't overwrite each other
            with open(f"image/{title}_{i}.jpg", 'wb') as f:
                f.write(resp.content)


if __name__ == '__main__':
    os.makedirs('image', exist_ok=True)  # make sure the output folder exists
    url = ["https://www.umei.cc/bizhitupian/meinvbizhi/"]
    url_list = url + [f'https://www.umei.cc/bizhitupian/meinvbizhi/index_{i}.htm' for i in range(2, 10)]
    for url in url_list:
        pool.submit(get_image_links, url)
    for i in range(5):
        t = threading.Thread(target=image_save)
        t.start()
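One design note: everything above parses HTML with regular expressions, which breaks as soon as the site tweaks its markup. If you happen to have beautifulsoup4 installed (an assumption; the script above does not use it), the listing pages can be parsed by tag and attribute instead. A sketch of just that replacement parser, built on the same "lazy" thumbnail class and alt titles the regex targets:

from bs4 import BeautifulSoup  # assumption: pip install beautifulsoup4

def get_image_links_bs(url):
    # same role as get_image_links above, but parsed by tag/attribute instead of regex
    html = requests.get(url=url, headers=headers)
    html.encoding = 'utf-8'
    soup = BeautifulSoup(html.text, 'html.parser')
    for img in soup.select('img.lazy'):
        link = img.find_parent('a')
        if link and link.get('href'):
            q.put([link['href'], img.get('alt', 'untitled')])

Swapping it in only requires changing the pool.submit(...) call in the __main__ block.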

Result: run the script and the downloaded pictures end up in the image/ folder.