背景
用户反馈相册滑动卡顿,推测是由于用户相册里的照片太多。于是想做一个压测实验:导入上万张图片到本地相册进行测试。问题是去哪里弄这么多图片?于是想到了用 Python 批量抓取图片并下载到本机。
实现
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
def download_images(url):
    """Download every .jpg/.jpeg/.png image referenced by <img> tags on *url*.

    Images are saved into a local ``img/`` directory (created if missing).
    A failure on one image is reported and skipped so a single dead link
    does not abort the whole crawl.

    :param url: page URL to scrape for <img> tags.
    """
    # Fetch the page; a timeout prevents hanging forever on a dead host,
    # and raise_for_status surfaces 4xx/5xx instead of parsing an error page.
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Create the output directory (no-op if it already exists).
    os.makedirs("img", exist_ok=True)

    for img_tag in soup.find_all("img"):
        # Some <img> tags carry no src (e.g. lazy-loaded placeholders);
        # .get avoids the KeyError that img_tag["src"] would raise.
        src = img_tag.get("src")
        if not src:
            continue
        # Resolve relative links against the page URL.
        img_url = urljoin(url, src)
        # Strip any query string before checking the extension, so
        # ".../photo.jpg?w=640" is still recognized as a JPEG.
        path = img_url.split("?", 1)[0]
        if not path.lower().endswith((".jpg", ".jpeg", ".png")):
            continue
        try:
            img_response = requests.get(img_url, timeout=10)
            img_response.raise_for_status()
        except requests.RequestException as exc:
            # Best-effort: report and move on to the next image.
            print(f"Failed to download {img_url}: {exc}")
            continue
        img_filename = os.path.basename(path)
        save_path = os.path.join("img", img_filename)
        with open(save_path, "wb") as img_file:
            img_file.write(img_response.content)
        print(f"Downloaded image: {img_filename}")
# Page to scrape for images.
url = "https://www.vcg.com/creative/1361474534"

# Guard the crawl behind __main__ so importing this module for reuse
# (or testing) does not trigger network I/O as a side effect.
if __name__ == "__main__":
    download_images(url)
百度图片
import re
import time
import requests
import os
# Search keyword and the raw query word sent to Baidu's image-search API.
word = '天空'
queryWord = 'ok'
# Baidu image-search JSON endpoint.
# NOTE: the original text had the "&copyright=" parameter mangled into
# "©right=" by HTML-entity conversion ("&copy" -> ©); restored here.
url = (
    'https://image.baidu.com/search/acjson?tn=resultjson_com'
    '&logid=12009735572442623815&ipn=rj&ct=201326592&is=&fp=result&fr='
    '&word=' + word + '&cg=star&queryWord=' + queryWord +
    '&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest='
    '&copyright=&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1'
    '&expermode=&nojc=&isAsync=&pn=1000&rn=1000&gsm=1e&1678976552414='
)
# A browser-like User-Agent; Baidu rejects the default requests UA.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}
try:
    # Timeout so a stalled connection cannot hang the script.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    # Thumbnail URLs are embedded in the JSON payload as "thumbURL" fields.
    image_urls = re.findall(r'"thumbURL":"(.*?)"', response.text)
    os.makedirs('img', exist_ok=True)  # create the output directory
    downloaded = 0
    for i, image_url in enumerate(image_urls):
        try:
            image_response = requests.get(image_url, headers=headers, timeout=10)
            image_response.raise_for_status()
        except requests.RequestException as e:
            # Skip a bad thumbnail instead of aborting the whole crawl.
            print(f"爬取第{i}张图片失败: {e}")
            continue
        # Timestamp + index keeps names unique across repeated runs.
        file_name = f"img/{int(time.time())}_{i}.jpg"
        with open(file_name, 'wb') as f:
            f.write(image_response.content)
        downloaded += 1
        # Report success only after the file is actually written
        # (the original printed before saving).
        print(f"爬取第{i}张图片成功")
    # Report the number actually saved, not merely attempted.
    print(f"爬取{downloaded}张图片成功")
except requests.RequestException as e:
    print("请求异常:", e)
except IOError as e:
    print("保存图片异常:", e)