Python爬取和下载网站所有图片

125 阅读1分钟

主要功能

  1. 使用 requests 库获取网页的 HTML 源码
  2. 使用正则表达式提取 img 标签 src 属性中的图片 url
  3. 遍历 url 列表, 将相对地址补全为完整地址后逐个下载

运行截图

image.png

下载的图片

image.png

代码实现

import requests
import re
from pathlib import Path
from datetime import datetime
from urllib.parse import urlparse, urljoin
import time

# Browser-like request headers, sent with every request so the target site is
# less likely to reject the crawler.
# Only real HTTP header fields belong here. The former "method": "GET" entry was
# the request verb (presumably copied from the browser DevTools request summary),
# not a header field, and would have been sent to the server as a bogus
# "method: GET" header.
headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "no-cache",
    "pragma": "no-cache",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
}


def request():
    """Fetch the page at the module-level ``url`` and collect its image URLs.

    The page is requested with the module-level ``headers``.

    Returns:
        list[str]: the ``src`` value of every ``<img ...>`` tag matched in the
        response text, in the order they appear. Values are returned exactly
        as written in the HTML, so they may be absolute, protocol-relative
        (``//host/a.png``) or relative.
    """
    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=10)
    html = response.text
    # Raw string so regex escapes reach `re` untouched. The quotes inside the
    # character classes are escaped so the single-quoted literal stays valid
    # Python (unescaped, the literal ends early and the file does not parse).
    return re.findall(r'<img .*?src=["\'](.*?)["\'].*?>', html)


def startDownload(urls, dirName):
    """Download every image in ``urls`` into the directory ``dirName``.

    Each ``src`` value is resolved against the module-level page ``url``,
    fetched with the module-level ``headers`` and written to a file named
    ``<timestamp>-<index><suffix>``. An image that cannot be downloaded is
    reported and skipped, so one bad link does not abort the remaining ones.

    Args:
        urls: iterable of ``src`` strings taken from ``<img>`` tags. They may
            be absolute, protocol-relative (``//host/x.png``), root-relative
            (``/x.png``) or path-relative (``x.png``).
        dirName: directory to save into; created with its parents if missing.
    """
    saveDir = Path(dirName)
    # exist_ok=True already covers the "directory is there" case.
    saveDir.mkdir(parents=True, exist_ok=True)

    # Suffixes kept as-is when found in the URL path; anything else is saved
    # with the historical '.jpg' default.
    knownSuffixes = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg',
                     '.bmp', '.ico', '.avif')

    for curIndex, imgUrl in enumerate(urls, start=1):
        print('imgUrl', imgUrl)
        imgUrl = imgUrl.strip()

        # An empty src would resolve to the page itself, and a data: URI holds
        # the image bytes inline, so there is nothing to request for either.
        if not imgUrl or imgUrl.startswith('data:'):
            print('跳过空地址或 data URI 内联图片')
            continue

        # The src may take any of these forms; all but the last need resolving:
        # <p><img src="//www.xxx.com/uploads/test.png" alt="image.png"></p>
        # <p><img src="/uploads/test.png" alt="image.png"></p>
        # <p><img src="uploads/test.png" alt="image.png"></p>
        # <p><img src="https://www.xxx.com/uploads/test.png" alt="image.png"></p>
        if imgUrl.startswith('//'):
            imgUrl = urlparse(url).scheme + ':' + imgUrl
            print('imgUrl添加http(s)前缀', imgUrl)
        elif imgUrl.startswith('/'):
            imgUrl = urljoin(url, imgUrl)
            print('imgUrl添加hostname', imgUrl)
        elif not urlparse(imgUrl).scheme:
            # Path-relative src: requests cannot fetch a URL without a scheme.
            imgUrl = urljoin(url, imgUrl)
            print('imgUrl补全相对路径', imgUrl)

        # Keep the real image type when the URL exposes it.
        suffix = Path(urlparse(imgUrl).path).suffix.lower()
        if suffix not in knownSuffixes:
            suffix = '.jpg'
        timestamp = datetime.now().strftime('%Y-%m-%d %H_%M_%S')
        filename = f'{timestamp}-{curIndex}{suffix}'

        print("fileName:", filename)
        try:
            response = requests.get(imgUrl, headers=headers, timeout=10)
            # Do not store an HTTP error page as if it were an image.
            response.raise_for_status()
        except requests.RequestException as err:
            print('下载失败, 已跳过:', imgUrl, err)
        else:
            (saveDir / filename).write_bytes(response.content)
        # Throttle: wait one second after each request.
        time.sleep(1)

# Crawl every <img> picture on the Baidu home page
# (pictures set through CSS background-image are not included).
url = 'https://www.baidu.com/'
saveDirPath = './downloadImg'  # folder the downloaded images are saved into
urls = request()

if urls:
    # At least one <img> tag was found: report the matches, then download them.
    print('img标签总数:', len(urls))
    print('urls', urls)
    startDownload(urls, saveDirPath)
else:
    print('该页面没有img标签')