Python 爬取和下载网站所有图片

使用 requests 库爬取前端网页代码, 找到所有 img 标签的 url, 针对不同形式的 url 做兼容处理后逐个下载。

主要功能

使用 requests 库爬取网页代码
使用正则表达式找到 img 标签的 url
遍历和处理 url 地址, 最后逐个下载

运行截图

下载的图片

代码实现

import requests
import re
from pathlib import Path
from datetime import datetime
from urllib.parse import urlparse, urljoin
import time

# Browser-like request headers, sent with every request so the target site is
# less likely to reject the crawler.
# NOTE: the former "method": "GET" entry was dropped. It is not an HTTP header
# (it mirrors the ":method" pseudo-header shown by browser dev tools); the
# request method is chosen by calling requests.get().
headers = {
    "accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.7"
    ),
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "no-cache",
    "pragma": "no-cache",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/127.0.0.0 Safari/537.36"
    ),
}


# Matches an <img ...> tag and captures the value of its src attribute.
# Written as a raw triple-quoted string so the character class can hold both
# quote characters; the previous single-quoted literal was a syntax error.
IMG_SRC_PATTERN = re.compile(r'''<img .*?src=["'](.*?)["'].*?>''')


def request():
    """Fetch the page at the module-level ``url`` and collect its image sources.

    The request is sent with the module-level ``headers``.

    Returns:
        list[str]: The ``src`` value of every ``<img>`` tag matched by
        ``IMG_SRC_PATTERN``, in document order. The values are returned as
        written in the HTML, so they may be relative or protocol-relative.

    Raises:
        requests.RequestException: If the page cannot be fetched, including
            when the server does not answer within the timeout.
    """
    # Without a timeout requests can block forever on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=10)
    return IMG_SRC_PATTERN.findall(response.text)


def startDownload(urls, dirName, baseUrl=None):
    """Download every image address in ``urls`` into the directory ``dirName``.

    Each file is named ``<timestamp>-<index><ext>``. ``index`` is the 1-based
    position of the entry in ``urls``. ``ext`` is the extension found in the
    image URL when it is a known image type, otherwise ``.jpg``.

    Args:
        urls: Iterable of ``src`` values as they appear in the page. They may
            be absolute (``https://host/a.png``), protocol-relative
            (``//host/a.png``), root-relative (``/a.png``) or
            document-relative (``img/a.png``).
        dirName: Target directory; created together with its parents when it
            does not exist yet.
        baseUrl: Page address used to resolve non-absolute entries. Defaults
            to the module-level ``url``.

    Empty entries and entries with a non-http(s) scheme (for example inline
    ``data:`` URIs) are skipped. A download that fails or returns an HTTP
    error status is reported and skipped, so one bad image does not abort the
    remaining downloads. Requests are sent with the module-level ``headers``.
    """
    knownSuffixes = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg',
                     '.bmp', '.ico', '.avif')

    # mkdir with exist_ok=True already tolerates an existing directory.
    path = Path(dirName)
    path.mkdir(parents=True, exist_ok=True)

    for curIndex, imgUrl in enumerate(urls, start=1):
        print('imgUrl', imgUrl)

        imgUrl = imgUrl.strip()
        if not imgUrl:
            print('跳过空的图片地址')
            continue

        scheme = urlparse(imgUrl).scheme
        if scheme and scheme not in ('http', 'https'):
            # data:, javascript:, ... cannot be fetched over HTTP.
            print('跳过非http(s)地址', imgUrl)
            continue

        if not scheme:
            # urljoin resolves protocol-relative, root-relative and
            # document-relative addresses against the page address.
            label = 'imgUrl添加http(s)前缀' if imgUrl.startswith('//') else 'imgUrl添加hostname'
            imgUrl = urljoin(url if baseUrl is None else baseUrl, imgUrl)
            print(label, imgUrl)

        # Keep the real extension so PNG/GIF/SVG files are not mislabelled.
        suffix = Path(urlparse(imgUrl).path).suffix.lower()
        if suffix not in knownSuffixes:
            suffix = '.jpg'
        filename = f"{datetime.now().strftime('%Y-%m-%d %H_%M_%S')}-{curIndex}{suffix}"
        print("fileName:", filename)

        try:
            response = requests.get(imgUrl, headers=headers, timeout=10)
            # Do not store an HTML error page under an image file name.
            response.raise_for_status()
        except requests.RequestException as err:
            print('下载失败', imgUrl, err)
            continue

        (path / filename).write_bytes(response.content)
        time.sleep(1)  # pause between downloads to be gentle on the server

# Crawl every <img> picture on the Baidu home page
# (pictures referenced through CSS background-image are not included).
url = 'https://www.baidu.com/'
saveDirPath = './downloadImg'  # folder that receives the downloaded files
urls = request()

if urls:
    # At least one <img> tag was found: report the matches, then download them.
    print('img标签总数:', len(urls))
    print('urls', urls)
    startDownload(urls, saveDirPath)
else:
    print('该页面没有img标签')