Automatically Downloading Images with a Python Crawler | Python Theme Month


This article is participating in "Python Theme Month"; see the event link for details.

Preparation

What is a crawler? A web crawler (also known as a web spider or web robot, and, in the FOAF community, more often called a web chaser) is a program or script that automatically fetches information from the World Wide Web according to certain rules. Less common names include ant, automatic indexer, emulator, and worm. See the Baidu Baike entry for details.

With the arrival of the big-data era, the demand for data keeps growing, and crawlers are an excellent way to collect data automatically. Recommended reading: a walkthrough of a Python web crawler learning path.
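At its core, a crawler just downloads a page over HTTP and extracts data from the HTML. A minimal sketch using the same libraries as the script later in this post (the target URL is only a placeholder; substitute any page you are allowed to fetch):

import urllib.request
from bs4 import BeautifulSoup

# Fetch a page and print its <title> -- the essence of every crawler:
# request, parse, extract.
req = urllib.request.Request(
    "https://example.com",  # placeholder URL
    headers={"user-agent": "Mozilla/5.0"},  # many sites reject requests with no User-Agent
)
html = urllib.request.urlopen(req).read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")  # the built-in parser needs no extra install
print(soup.title.get_text())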

A few Python learning links:

1. 请叫我汪海's CSDN blog

2. Liao Xuefeng's tutorial, plus the video version of the tutorial

3. A beginner's introduction to crawlers

4. The Scrapy crawler framework


Getting to Work

Download Python from the official website

Installation complete
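You can confirm the installation from a terminal; this prints the interpreter version if Python is on your PATH:

python --version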


Find some image-crawling code

Those who came before planted the trees so we can enjoy the shade. Thanks!


Install dependencies

Use pip install *** to install the corresponding dependencies.

pip install beautifulsoup4 lxml

Note that urllib ships with Python's standard library, so there is no need to pip install it; the third-party packages this script actually uses are beautifulsoup4 and lxml. In my case pip also asked to be upgraded first: python -m pip install --upgrade pip
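As a quick sanity check that the third-party packages are importable (the package names match the imports in the script below):

python -c "import bs4, lxml; print('dependencies OK')"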

Run the project

python main.py


Run it, and the images download successfully.

The code and repository links are below

Give it a try yourself; you might be in for a surprise (practice makes perfect!).

GitHub code

Gitee code

# -*- coding:utf-8 -*-
import os
import random
import ssl
import time
import urllib.request

from bs4 import BeautifulSoup

# Request header configuration
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
# Base URL of the site to crawl
BASE_URL = "https://www.mzitu.com"
# Directory where downloaded images are stored
BASE_DIR = "../images"


def start_work(serial_id):
    picture_dir = BASE_DIR + os.sep + serial_id
    if not os.path.exists(picture_dir):
        os.makedirs(picture_dir)  # makedirs also creates BASE_DIR if it does not exist yet
    page_count = get_page_count(serial_id)
    print("%s has %d images in total" % (serial_id, page_count))
    get_image_for_serial(picture_dir, serial_id, page_count)

# Get the number of pages in a gallery
def get_page_count(serial_id):
    header = {"user-agent": USER_AGENT}
    # Skip certificate verification; ssl._create_unverified_context is a private
    # API, but a common shortcut in quick scripts
    context = ssl._create_unverified_context()
    url = "%s/%s" % (BASE_URL, serial_id)
    req = urllib.request.Request(url, headers=header)
    resp = urllib.request.urlopen(req, context=context)
    content = resp.read()
    str_content = content.decode("utf-8")
    total_count = __get_counts(str_content)
    return total_count

# Parse the page count from the pagination links
def __get_counts(html_content):
    page_count = 0
    soup = BeautifulSoup(html_content, 'lxml')
    data = soup.select("body > div.main > div.content > div.pagenavi > a > span")
    if data and len(data) >= 3:
        # The next-to-last <span> holds the highest page number
        page_count = int(data[-2].get_text())
    return page_count

# Extract the image URL from a detail page
def get_image_url(html_content):
    soup = BeautifulSoup(html_content, 'lxml')
    data = soup.select("body > div.main > div.content > div.main-image > p > a > img")
    url = None
    try:
        url = data[0].get("src")
    except Exception as ex:
        print("exception occurred: %s" % ex)
    return url

# Collect the image URLs of every page in a gallery
def get_all_image_urls(serial_id, page_count):
    url_list = list()
    header = {"user-agent": USER_AGENT}
    context = ssl._create_unverified_context()
    if page_count <= 1:
        return url_list  # return the empty list rather than None

    for x in range(1, page_count + 1):
        print("Fetching the URL of image %d" % x)
        url = "%s/%s/%s" % (BASE_URL, serial_id, x)
        req = urllib.request.Request(url, headers=header)
        resp = urllib.request.urlopen(req, context=context)
        content = resp.read()
        str_content = content.decode("utf-8")
        img_url = get_image_url(str_content)
        if img_url:
            url_list.append(img_url)
            print("Image %d is at: %s" % (x, img_url))
        # Pause between requests so as not to hammer the server
        time.sleep(random.randint(1, 2))
    return url_list


# Download every image in a gallery
def get_image_for_serial(dir_path, serial_id, total_count):
    for i in range(1, total_count + 1):
        print("Fetching image %d" % i)
        get_image_for_index(dir_path, serial_id, i)
        sleep_seconds = random.randint(1, 10) / 10
        time.sleep(sleep_seconds)

# Download the image on one specific page
def get_image_for_index(dir_path, serial_id, page_index):
    header = {"user-agent": USER_AGENT}
    context = ssl._create_unverified_context()
    print("Fetching the URL of image %d" % page_index)
    ref_url = "%s/%s/%s" % (BASE_URL, serial_id, page_index)
    req = urllib.request.Request(ref_url, headers=header)
    resp = urllib.request.urlopen(req, context=context)
    content = resp.read()
    str_content = content.decode("utf-8")
    img_url = get_image_url(str_content)
    if img_url:
        print("Image %d is at: %s" % (page_index, img_url))
        print("Trying to save image %s" % img_url)
        save_img(dir_path, img_url, ref_url)

# Save a batch of images; no per-image detail page is known here,
# so BASE_URL serves as the Referer
def save_imgs(dir_path, img_urls):
    for img_addr in img_urls:
        save_img(dir_path, img_addr, BASE_URL)

# Save a single image
def save_img(dir_path, img_url, ref_url):
    header = {
        "user-agent": USER_AGENT,
        # The site checks the Referer header to block hotlinking
        "Referer": ref_url
    }
    context = ssl._create_unverified_context()
    req = urllib.request.Request(img_url, headers=header)
    resp = urllib.request.urlopen(req, context=context)
    content = resp.read()
    with open(dir_path + os.sep + img_url.split('/')[-1], 'wb') as f:
        f.write(content)  # the with block closes the file automatically
        print("Saved file %s to directory: %s" % (img_url.split('/')[-1], dir_path))
    time.sleep(random.randint(1, 2))


if __name__ == "__main__":
    vol_list = ["204061"]
    for serial_id in vol_list:
        start_work(serial_id)
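One small extension you might try (my own sketch, not part of the original script): read the serial IDs from the command line so you can grab other galleries without editing the source. This variant of the entry point would replace the one above:

import sys

if __name__ == "__main__":
    # e.g. `python main.py 204061 204062`; falls back to the default ID
    # when no arguments are given
    vol_list = sys.argv[1:] or ["204061"]
    for serial_id in vol_list:
        start_work(serial_id)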

Summary

Have I learned Python? Not at all!! I have merely installed Python and run one example successfully, which counts as a first hands-on crawler exercise. All I know so far are some basic types and functions.
