Scraping beautiful wallpapers with BS4 and saving them locally, crawling 10 pages of listings

# coding=utf-8
import requests
from bs4 import BeautifulSoup
import csv
import time
import os
# "html.parser" tells BeautifulSoup to treat the response text as HTML content
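# A quick illustration of the parser choice:
#   BeautifulSoup('<p>hi</p>', 'html.parser').p.text  ->  'hi'
# The built-in html.parser needs no extra install; 'lxml' is a faster optional backend.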

os.makedirs("img", exist_ok=True)  # make sure the img/ output directory exists before saving files
img_csv = open("img_download.csv", mode='w', newline='')  # CSV log of downloaded image URLs
img_writer = csv.writer(img_csv)
a = 1    # current listing page number
num = 0  # running counter used to name the saved images
while a <= 10:
    if a == 1:
        url = "https://umei.cc/bizhitupian/weimeibizhi"
        # Optional: some sites block the default requests User-Agent; if so,
        # uncomment this dict and pass headers=heads to requests.get.
        # heads = {
        #     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
        # }
        resp = requests.get(url)
        resp.encoding = 'UTF-8'
        resp_content = resp.text
        page1 = BeautifulSoup(resp_content, "html.parser")
        page2 = page1.find("div", class_="TypeList").find_all("a")  # links to each wallpaper's detail page
        for s in page2:
            img_url = s.get('href')
            detail_url = 'https://umei.cc/' + img_url.strip('/')   # absolute URL of the detail page
            resp_detail = requests.get(detail_url)
            resp_detail.encoding = 'utf-8'
            page4 = BeautifulSoup(resp_detail.text, "html.parser")
            page5 = page4.find("div", class_="ImageBody").find_all('img')
            for img_tag in page5:
                img_result = img_tag.get('src')
                # Start downloading the image
                img = requests.get(img_result)                     # fetch the raw image bytes
                img_name = 'wallpaper{}.jpg'.format(num)           # build a file name for this image
                with open("img/" + img_name, mode='wb') as f:      # save it under the img/ directory
                    f.write(img.content)
                    print(img_name, 'downloaded')
                    time.sleep(1)                                  # pause between downloads
                    num += 1
                img_writer.writerow([img_result])                  # log the image URL in the CSV
    # The URL of the first listing page has no index parameter, while every page
    # from the second one on does, so the data is fetched in two steps; the pages
    # after the first are crawled below.
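    # Observed URL pattern, for example:
    #   page 1:  https://umei.cc/bizhitupian/weimeibizhi
    #   page 2:  https://umei.cc/bizhitupian/weimeibizhi/index_2.htm
    #   page 10: https://umei.cc/bizhitupian/weimeibizhi/index_10.htm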
    else:
        urladd = "https://umei.cc/bizhitupian/weimeibizhi/index_{}.htm".format(a)
        resp = requests.get(urladd)
        resp.encoding = 'UTF-8'
        resp_content = resp.text
        page1add = BeautifulSoup(resp_content, "html.parser")
        page2add = page1add.find("div", class_="TypeList").find_all("a")  # links to each wallpaper's detail page
        for ii in page2add:
            img_urladd = ii.get('href')
            detail_urladd = 'https://umei.cc/' + img_urladd.strip('/')   # absolute URL of the detail page
            resp_detailadd = requests.get(detail_urladd)
            resp_detailadd.encoding = 'utf-8'
            page4add = BeautifulSoup(resp_detailadd.text, "html.parser")
            page5add = page4add.find("div", class_="ImageBody").find_all('img')
            for iii in page5add:
                img_resultadd = iii.get('src')
                img2 = requests.get(img_resultadd)                       # fetch the raw image bytes
                img2name = 'wallpaper{}.jpg'.format(num)
                with open('img/' + img2name, mode='wb') as f:
                    f.write(img2.content)
                    print(img2name, 'downloaded')
                    time.sleep(1)                                        # pause between downloads
                    num += 1
                img_writer.writerow([img_resultadd])                     # log the image URL in the CSV


    a += 1


print('over!')
img_csv.close()
resp.close()
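
The two branches above differ only in how the listing URL is built, so the whole crawl can be collapsed into a single loop. Here is a minimal sketch of that refactor, assuming the same page structure (the TypeList and ImageBody divs) and URL pattern as above; the fetch_soup helper, the timeout, and the User-Agent header are my own additions rather than part of the original script.

# coding=utf-8
import os
import time
import requests
from bs4 import BeautifulSoup

HEADERS = {"User-Agent": "Mozilla/5.0"}  # assumption: a browser-like UA, in case the site rejects the default one

def fetch_soup(url):
    # Fetch a page and return it parsed as HTML
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.encoding = 'utf-8'
    return BeautifulSoup(resp.text, "html.parser")

os.makedirs("img", exist_ok=True)
num = 0
for page in range(1, 11):  # listing pages 1 through 10
    if page == 1:
        url = "https://umei.cc/bizhitupian/weimeibizhi"
    else:
        url = "https://umei.cc/bizhitupian/weimeibizhi/index_{}.htm".format(page)
    listing = fetch_soup(url)
    for a_tag in listing.find("div", class_="TypeList").find_all("a"):
        detail = fetch_soup('https://umei.cc/' + a_tag.get('href').strip('/'))
        for img_tag in detail.find("div", class_="ImageBody").find_all('img'):
            img_bytes = requests.get(img_tag.get('src'), headers=HEADERS).content
            name = 'wallpaper{}.jpg'.format(num)
            with open('img/' + name, mode='wb') as f:
                f.write(img_bytes)
            print(name, 'downloaded')
            num += 1
            time.sleep(1)  # pause between downloads to go easy on the server

With the URL construction isolated in the if/else, everything after it is shared, which removes the duplicated parsing and download code of the two-branch version.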