Python图片抓取

144 阅读2分钟

背景

用户反馈相册滑动卡顿,推测是由于用户相册里面的照片太多,于是就想做一个压测实验,导入上万张图片到本地相册去做测试。问题是去哪里弄这么多图片,于是想到了用 Python 批量抓取图片并下载到本机。

实现

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def download_images(url):
    """Download every jpg/jpeg/png image referenced by <img> tags on *url*.

    Images are saved into a local ``img`` directory under the basename of
    their URL. Relative image paths are resolved against *url*.

    Raises:
        requests.RequestException: if the page itself cannot be fetched.
    """
    # Fetch the page; fail fast on HTTP errors instead of parsing an error body.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Create the output directory (no-op if it already exists).
    os.makedirs("img", exist_ok=True)

    for img_tag in soup.find_all("img"):
        # Some <img> tags carry no src (e.g. lazy-loaded images) — skip them
        # instead of raising KeyError.
        src = img_tag.get("src")
        if not src:
            continue
        # Resolve relative paths against the page URL.
        img_url = urljoin(url, src)
        # Only download common raster formats; case-insensitive so ".JPG" works.
        if not img_url.lower().endswith((".jpg", ".jpeg", ".png")):
            continue
        try:
            img_response = requests.get(img_url, timeout=10)
            img_response.raise_for_status()
        except requests.RequestException as exc:
            # One broken image must not abort the whole crawl.
            print(f"Failed to download {img_url}: {exc}")
            continue
        img_filename = os.path.basename(img_url)
        save_path = os.path.join("img", img_filename)
        with open(save_path, "wb") as img_file:
            img_file.write(img_response.content)
        print(f"Downloaded image: {img_filename}")

if __name__ == "__main__":
    # Only crawl when executed as a script, not when imported as a module.
    url = "https://www.vcg.com/creative/1361474534"
    download_images(url)

百度图片

import re
import time
import requests
import os

# Search keyword (what the user typed) and the raw query echoed to the API.
word = '天空'
queryWord = 'ok'
# Baidu image-search JSON endpoint; pn is the result offset and rn the page
# size. requests percent-encodes the non-ASCII keyword when sending.
url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&logid=12009735572442623815&ipn=rj&ct=201326592&is=&fp=result&fr=&word='+word+'&cg=star&queryWord=' + queryWord + '&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright=&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&expermode=&nojc=&isAsync=&pn=1000&rn=1000&gsm=1e&1678976552414='

headers = {
    # Pose as a desktop Chrome browser; Baidu rejects the default
    # python-requests User-Agent.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}

try:
    response = requests.get(url, headers=headers, timeout=15)
    response.raise_for_status()  # fail fast on an HTTP error page
    # thumbURL values come from a JSON body, so "/" may arrive escaped as "\/".
    image_urls = [u.replace('\\/', '/')
                  for u in re.findall('"thumbURL":"(.*?)"', response.text)]

    os.makedirs('img', exist_ok=True)  # ensure the output directory exists

    saved = 0
    for i, image_url in enumerate(image_urls):
        try:
            image_response = requests.get(image_url, headers=headers,
                                          timeout=15)
            image_response.raise_for_status()
        except requests.RequestException as e:
            # Skip a broken thumbnail instead of aborting the whole run.
            print(f"爬取第{i}张图片失败:", e)
            continue
        # Timestamp + index keeps file names unique within this run.
        timestamp = int(time.time())
        file_name = f"img/{timestamp}_{i}.jpg"
        with open(file_name, 'wb') as f:
            f.write(image_response.content)
        saved += 1
        # Report success only after the bytes are actually on disk.
        print(f"爬取第{i}张图片成功")

    # Count files actually written, not URLs found.
    print(f"爬取{saved}张图片成功")
except requests.RequestException as e:
    print("请求异常:", e)
except IOError as e:
    print("保存图片异常:", e)