主要功能
- 使用 requests 库获取网页的 HTML 源码
- 使用正则表达式提取 img 标签 src 属性中的图片地址
- 遍历并处理这些地址 (补全协议和域名), 最后逐个下载
运行截图

下载的图片

代码实现
import requests
import re
from pathlib import Path
from datetime import datetime
from urllib.parse import urlparse, urljoin
import time
# Browser-like request headers sent with every HTTP request made by this script.
# The HTTP method is chosen by the requests call itself (requests.get), so it
# must not be listed here: a "method" entry would be transmitted as a bogus
# "method: GET" header field rather than selecting the verb.
headers = {
    "accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.7"
    ),
    "accept-language": "zh-CN,zh;q=0.9",
    # Ask intermediaries not to serve a cached copy.
    "cache-control": "no-cache",
    "pragma": "no-cache",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/127.0.0.0 Safari/537.36"
    ),
}
# Matches each <img ...> tag and captures the value of its src attribute,
# whether it is wrapped in single or double quotes. Written as a raw
# triple-quoted literal so both quote characters can appear unescaped.
IMG_SRC_PATTERN = re.compile(r'''<img .*?src=["'](.*?)["'].*?>''')


def request():
    """Fetch the page at the module-level ``url`` and collect its image URLs.

    The page is requested with the module-level ``headers``.

    Returns:
        list[str]: the raw ``src`` value of every ``<img>`` tag matched in the
        response body, in document order; an empty list when nothing matches.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
        (connection failure, timeout, ...).
    """
    # A timeout keeps the script from hanging forever on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=10)
    return IMG_SRC_PATTERN.findall(response.text)
def startDownload(urls, dirName):
    """Download every image referenced in ``urls`` into the directory ``dirName``.

    Each entry is resolved against the module-level page ``url`` and fetched
    with the module-level ``headers``. Files are written as
    ``<timestamp>-<index>.jpg`` where ``index`` is the 1-based position of the
    entry in ``urls``. An entry that cannot be fetched is reported and skipped
    so that one bad image does not abort the remaining downloads.

    Args:
        urls: iterable of raw ``src`` strings taken from ``<img>`` tags.
        dirName: target directory; created (with parents) when missing.
    """
    # Create the target directory if it does not exist yet.
    saveDir = Path(dirName)
    saveDir.mkdir(parents=True, exist_ok=True)

    for curIndex, imgUrl in enumerate(urls, start=1):
        print('imgUrl', imgUrl)

        # Empty src values and non-HTTP schemes (e.g. inline "data:" URIs)
        # cannot be downloaded over HTTP, so skip them instead of crashing.
        scheme = urlparse(imgUrl).scheme
        if not imgUrl or (scheme and scheme not in ('http', 'https')):
            print('跳过无法下载的地址', imgUrl)
            continue

        filename = datetime.now().strftime('%Y-%m-%d %H_%M_%S') + '-' + str(curIndex) + '.jpg'

        # An image path may take any of these forms; urljoin resolves them all:
        # <p><img src="//www.xxx.com/uploads/test.png" alt="image.png"></p>
        # <p><img src="/uploads/test.png" alt="image.png"></p>
        # <p><img src="uploads/test.png" alt="image.png"></p>
        # <p><img src="https://www.xxx.com/uploads/test.png" alt="image.png"></p>
        if imgUrl.startswith('//'):
            imgUrl = urljoin(url, imgUrl)
            print('imgUrl添加http(s)前缀', imgUrl)
        elif not scheme:
            # Covers both host-relative ("/a.png") and document-relative
            # ("img/a.png") paths.
            imgUrl = urljoin(url, imgUrl)
            print('imgUrl添加hostname', imgUrl)

        print("fileName:", filename)
        try:
            # Request the image; the timeout prevents an indefinite hang.
            response = requests.get(imgUrl, headers=headers, timeout=10)
            # Do not store an HTTP error page as if it were an image.
            response.raise_for_status()
        except requests.RequestException as err:
            print('下载失败', imgUrl, err)
        else:
            with open(saveDir / filename, 'wb') as f:
                f.write(response.content)
        # Pause between requests to avoid hammering the server.
        time.sleep(1)
# Scrape every <img> picture on the Baidu home page
# (pictures set through CSS background-image are not included).
url = 'https://www.baidu.com/'
saveDirPath = './downloadImg'  # folder the downloaded images are saved into
urls = request()
if urls:
    print('img标签总数:', len(urls))
    print('urls', urls)
    startDownload(urls, saveDirPath)
else:
    # Nothing matched: report it and finish without downloading.
    print('该页面没有img标签')