Scraping Web Photos with a Python Crawler


Analyzing the URL pattern

Open the cat photo gallery site. Its URLs follow this pattern:

Page 1: https://www.huiyi8.com/tupian/tag-%E7%8C%AB%E5%92%AA/1.html
Page 2: https://www.huiyi8.com/tupian/tag-%E7%8C%AB%E5%92%AA/2.html
Page 3: https://www.huiyi8.com/tupian/tag-%E7%8C%AB%E5%92%AA/3.html
Page n: https://www.huiyi8.com/tupian/tag-%E7%8C%AB%E5%92%AA/n.html
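
Only the trailing number changes from page to page, so every page URL can be generated from one template. A minimal sketch (the `base` template string is my own naming, not something from the site):

# Template with a placeholder for the page number
base = "https://www.huiyi8.com/tupian/tag-%E7%8C%AB%E5%92%AA/{}.html"

# Generate the first three page URLs
for page in range(1, 4):
    print(base.format(page))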

Install the required libraries

pip install beautifulsoup4
pip install requests
pip install urllib3
pip install lxml
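
To confirm the installs worked, import the packages and print their versions (urllib3 is pulled in automatically as a dependency of requests, so its line above is optional):

import bs4
import requests

# These print only if the installs succeeded
print(bs4.__version__)
print(requests.__version__)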

The image-scraping code:

from bs4 import BeautifulSoup
import requests
import os

# URL of the first page of cat photos
url = 'https://www.huiyi8.com/tupian/tag-%E7%8C%AB%E5%92%AA/1.html'
# Directory to save images to; the r prefix makes this a raw string (no escaping)
path = r"/Users/xxx/Downloads/cats/"
# Create the save directory if it does not already exist
if not os.path.exists(path):
    os.mkdir(path)


# Build the list of all cat-photo page URLs
def allpage():
    all_url = []
    # Pages 1 through 19
    for i in range(1, 20):
        # Swap in the page number; the last 6 characters of url are "1.html"
        each_url = url[:-6] + str(i) + '.html'
        # Collect each generated URL
        all_url.append(each_url)
    # Return every page address
    return all_url


# Main entry point
if __name__ == '__main__':
    # Call allpage to get every page address
    img_url = allpage()
    # Session cookies copied from the browser's developer tools
    cookies = {
        "t": "c41f9a1b34c87907a0df5375c5f0801b",
        "r": "8507",
        "Hm_lvt_4c65a21638f96d83db9e42a8df2772a9": "1703583082",
        "Hm_lpvt_4c65a21638f96d83db9e42a8df2772a9": "1703583652"
    }
    # Request headers copied from the browser so the site treats us as a normal visitor
    headers = {
        "authority": "www.huiyi8.com",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,ak;q=0.7,es;q=0.6",
        "cache-control": "max-age=0",
        "referer": "https://www.huiyi8.com/tupian/tag-%E7%8C%AB%E5%92%AA/1.html",
        "sec-ch-ua": "\"Not_A Brand\";v=\"8\", \"Chromium\";v=\"120\", \"Google Chrome\";v=\"120\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"macOS\"",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    for page_url in img_url:
        # Fetch the page source; use the detected encoding so Chinese text decodes correctly
        resp = requests.get(page_url, cookies=cookies, headers=headers)
        resp.encoding = resp.apparent_encoding
        html = BeautifulSoup(resp.text, 'lxml')
        # Array to hold the matching img tags
        img_urls = []
        # Go through every img tag in the page
        for img in html.find_all('img'):
            # Keep only src values that start with http and end with jpg
            src = img.get('src', '')
            if src.startswith('http') and src.endswith('jpg'):
                # Add qualifying img tags to the img_urls array
                img_urls.append(img)
        # Loop over every collected tag
        for k in img_urls:
            # Image URL
            img = k.get('src')
            # Name the file after the alt text; str() guards against a missing alt (None)
            name = str(k.get('alt'))
            file_name = path + name + '.jpg'
            # Fetch the raw image bytes and write them to disk
            data = requests.get(url=img, headers=headers, cookies=cookies).content
            with open(file_name, 'wb') as f:
                f.write(data)
            # Log each downloaded image
            print(img, file_name)
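
One caveat: the alt text is used verbatim as the file name, so two images with the same alt overwrite each other, and an alt containing a path separator makes open() fail. A minimal hardening sketch; the safe_name helper below is my own addition, not part of the original script:

import os
import re

def safe_name(alt, index, directory):
    """Turn alt text into a usable, non-colliding .jpg path (hypothetical helper)."""
    # Replace characters that are unsafe in file names; fall back to 'cat' for empty alts
    base = re.sub(r'[\\/:*?"<>|]', '_', alt).strip() or 'cat'
    candidate = os.path.join(directory, base + '.jpg')
    # Append the loop index if that name is already taken
    if os.path.exists(candidate):
        candidate = os.path.join(directory, base + '_' + str(index) + '.jpg')
    return candidate

# Example usage inside the download loop:
#   for i, k in enumerate(img_urls):
#       file_name = safe_name(str(k.get('alt')), i, path)
print(safe_name('cute/cat', 0, '/tmp'))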

Results

Once the script finishes, the downloaded cat photos appear in the cats/ directory, one .jpg per image, named after its alt text.
