Scraping high-res anime wallpapers from a certain site, with anti-scraping workarounds (plus edition) — Python



A few days ago I noticed that the high-res image URLs on some of these wallpaper sites actually follow a pattern. I had planned to write a crawler with asyncio and Selenium, but when I looked again today, the full-size URL was sitting right there in an XPath I had somehow missed before. So I quickly wrote a crawler for the site's fixed sections — 4K scenery, 4K anime and so on — that scrapes pages 1 through n of a section, where n is whatever you choose. Grab as much as you like, but please be kind to the site: it has since added a captcha, so the difficulty has gone up, and I've added some counter-measures of my own; if those eventually stop working I'll hand-write some image recognition. The full source for grabbing the high-res images is below — if it helps, please leave a like before you go, thanks!
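To make the XPath point concrete: on this site's detail pages the full-size image sits inside an anchor that (at the time I looked) carries `id="img"`, so one short XPath pulls the URL out. Here is a minimal offline sketch using the standard library's ElementTree against a trimmed-down sample of the markup — the real page may differ, so treat the selector as an assumption to verify in DevTools:

```python
import xml.etree.ElementTree as ET

# Trimmed-down, hand-written sample of a detail page; the real markup may differ.
sample = '''<div class="photo-pic">
  <a href="" id="img">
    <img src="/uploads/allimg/demo.jpg" data-pic="/uploads/allimg/demo.jpg" alt="demo" title="demo" />
  </a>
</div>'''

root = ET.fromstring(sample)
# ElementTree's limited XPath supports attribute predicates like [@id='img']
img = root.find(".//a[@id='img']/img")
print(img.get('src'))  # /uploads/allimg/demo.jpg
```

With `lxml` the same query would be `tree.xpath('//a[@id="img"]/img/@src')`; ElementTree is used here only to keep the sketch dependency-free.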

import os
import re  # regular expressions, used to match the image links
import time
import random

import requests

# url = 'https://www.3gbizhi.com/wallDM/4383.html'  # this site paginates the same way -- add your own .format() and remember the sleep
k = int(input("Number of pages to scrape: "))
uum = []  # collects the relative detail-page links
UA_LIST = ['Mozilla/5.0 (compatible; U; ABrowse 0.6; Syllable) AppleWebKit/420+ (KHTML, like Gecko)', 'Mozilla/5.0 (compatible; U; ABrowse 0.6;  Syllable) AppleWebKit/420+ (KHTML, like Gecko)', 'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)', 'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR   3.5.30729)', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0;   Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;   SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; SV1; Acoo Browser; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; Avant Browser)', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1;   .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; GTB5; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)', 'Mozilla/4.0 (compatible; Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729); Windows NT 5.1; Trident/4.0)', 'Mozilla/4.0 (compatible; Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB6; Acoo Browser; .NET CLR 1.1.4322; .NET CLR 2.0.50727); Windows NT 5.1; Trident/4.0; Maxthon; .NET CLR 2.0.50727; .NET CLR 1.1.4322; InfoPath.2)', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser; GTB6; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; InfoPath.1; .NET CLR 3.5.30729; .NET CLR 3.0.30618)']
# user-agent strings collected from the web


for j in range(2, k + 1):  # page 1 is index.html on this site, so numbered pages start at 2
    ip = random.randint(0, len(UA_LIST) - 1)  # random index into the UA list (0..10 covers all 11 entries)
    headers = {'user-agent': UA_LIST[ip]}
    # detail pages look like https://pic.netbian.com/tupian/30002.html
    u_2 = 'https://pic.netbian.com/4kdongman/index_{}.html'.format(j)
    response = requests.get(u_2, headers=headers)
    response.encoding = 'gbk'  # the site serves gbk-encoded pages
    html = response.text

    # pull the relative detail-page links out of the listing page
    url_2 = re.findall('<li><a href="(.*?)" target="_blank"><img src=".*?" alt=".*?" /><b>.*?</b></a>', html)

    uum += url_2
    time.sleep(0.1)  # be gentle with the server

for i in range(0, len(uum)):
    ip = random.randint(0, len(UA_LIST) - 1)
    print('Scraping {}'.format(i))
    url = 'https://pic.netbian.com' + uum[i]  # uum entries are site-relative paths starting with /
    headers = {'user-agent': UA_LIST[ip]}

    response = requests.get(url=url, headers=headers)
    response.encoding = 'gbk'
    time.sleep(0.1)  # leave the server a little breathing room
    html = response.text

    # alternate pattern for another page layout:
    # urls = re.findall('<img lazysrc="(.*?)" lazysrc2x=".*?" height="348px" alt=".*?" title=".*?" />', html)
    urls = re.findall('<img src="(.*?)" data-pic=".*?" alt=".*?" title=".*?"></a>', html)
    filename = 'D:\\点击获取资源壁纸破解\\'  # doubled backslashes -- a single trailing \ would escape the closing quote
    print(urls)

    if not os.path.exists(filename):
        os.mkdir(filename)

    if len(urls) != 0:
        for url in urls:
            url = 'https://pic.netbian.com' + url  # the matched src already starts with /
            name = url.split('/')[-1]

            response = requests.get(url, headers=headers)

            with open(filename + name, mode='wb') as f:
                f.write(response.content)

    if i == len(uum) - 1:
        print('Done scraping')
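The save step in the inner loop (filename taken from `url.split('/')[-1]`, then a binary write) can be factored into a small helper. This is just a sketch of the same logic, with `os.makedirs(..., exist_ok=True)` standing in for the `os.path.exists`/`os.mkdir` pair so nested folders also work:

```python
import os

def save_image(content: bytes, url: str, folder: str) -> str:
    """Write raw image bytes to <folder>, naming the file after the
    last path segment of its URL, and return the path written."""
    os.makedirs(folder, exist_ok=True)  # create the folder (and any parents) if missing
    name = url.split('/')[-1]           # e.g. 'demo.jpg'
    path = os.path.join(folder, name)
    with open(path, mode='wb') as f:    # binary mode, same as the loop above
        f.write(content)
    return path

# hypothetical URL and bytes, just to exercise the helper
p = save_image(b'\x89PNG fake bytes', 'https://pic.netbian.com/uploads/demo.jpg', 'downloads')
print(p)
```

In the loop you would call it as `save_image(response.content, url, filename)` in place of the `open`/`write` pair.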

Results:

(screenshots of the downloaded wallpapers)

Summary:

The site has some anti-scraping measures, so I collected 11 user-agent strings into a list and draw a random number as the list index for each request, disguising the headers. If that's still not enough, you can add more of the fields the site expects to the headers — taking gzip-compressed text resources as an example, entries like 'accept-language': 'zh-CN' or 'content-type': 'application/json;charset=UTF-8'; if the page is fetched via POST, you may also need to package signed or encrypted parameters. Since I scrape synchronously and added time.sleep() calls, downloading the images is on the slow side, but to be safe the sleep must stay: some small sites' servers can't take being hit too fast. My own takeaway from this crawler is a better grasp of Python syntax and of how to fetch exactly the data I need. The scraped data can also feed later analysis — for instance, putting the titles into a dictionary for SEO work down the road. Learning to scrape also calls for some front-end knowledge: CSS selectors, XPath, JavaScript and so on.
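To make the header trick above concrete, here is a sketch of a small helper that picks a random User-Agent (using random.choice, which sidesteps the off-by-one risk of hand-rolled randint indices) and merges in the extra fields mentioned in the summary. The exact header values are assumptions to tune per site:

```python
import random

UA_LIST = [
    'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
]  # trimmed-down stand-in for the full list in the script

def build_headers(ua_list):
    """Random UA plus the extra fields a pickier site may expect."""
    return {
        'user-agent': random.choice(ua_list),  # uniform over the whole list, no index math
        'accept-language': 'zh-CN',
        'accept-encoding': 'gzip, deflate',    # advertise gzip support
    }

h = build_headers(UA_LIST)
print(sorted(h))  # ['accept-encoding', 'accept-language', 'user-agent']
```

In the script, `headers = build_headers(UA_LIST)` would then replace the two-line `randint` + dict pattern in both loops.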