Python 3 Requests-Redis Crawler



Note: this is a pure demo. Where your heart is, that is where the results are.

# requests-redis
 
"""
所谓分布式爬虫,就是要用多台电脑同时爬取数据,相比于单机爬虫,分布式爬虫的爬取速度更快,
也能更好地应对IP的检测。本文介绍的是利用Redis数据库实现的分布式爬虫,Redis是一种常用的非
关系型数据库,常用数据类型包括String、Hash、Set、List和Sorted Set,重要的是Redis支持主从
复制,主机能将数据同步到从机,也就能够实现读写分离。因此我们可以利用Redis的特性,借助requests
模块发送请求,再解析网页和提取数据,实现一个简单的分布式爬虫。
#此文仿制于https://www.cnblogs.com/TM0831/p/11372833.html
"""
import os
import random
import re
import time

import requests
from fake_useragent import UserAgent  # provides the random User-Agent pool used below
from lxml import etree
from redis import Redis
 
ua = UserAgent(verify_ssl=False)  # pool of real browser User-Agent strings
headers = {
    "Connection": "keep-alive",
    "Cookie": "Hm_lvt_73f7fa8431ee92c8d44d7fe9b72394af=1565961296,1565961531,1565963461,1566014119; Hm_lpvt_73f7fa8431ee92c8d44d7fe9b72394af=1566024386",  # session cookie from the author's browser; replace with your own
    "Host": "www.shu800.com",
    "Referer": "http://www.shu800.com/",
    "User-Agent": ua.random
}
r = Redis(host="192.168.229.130", port=6379, db=1)  # Redis server on the master machine
x = 1  # request counter used for the simple throttle in get_image()
 
 
def get_urls():
    """
    监听Redis中是否有URL,如果没有就一直运行,如果有就提取出来进行爬取
    :return:
    """
    if b"href" in r.keys():
        while True:
            try:
                url = r.spop("href")
                url = url.decode("utf-8")  # unicode转str
                print("Crawling URL: ", url)
                get_image(url)
                get_img_page(url)
            except:
                if b"href" not in r.keys():  # 爬取结束,退出程序
                    break
                else:
                    continue
    else:
        time.sleep(5)
        get_urls()
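
# For a quick manual test, seed the queue from redis-cli before starting this
# worker (the gallery URL below is a placeholder, not a real page from the site):
#
#   redis-cli -h 192.168.229.130 -n 1 SADD href "http://www.shu800.com/<gallery>.html"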
 
 
def get_img_page(url):
    """
    根据传入的图集URL,获取该图集的总页数及每一页的URL
    :param url: 图集URL
    :return:
    """
    try:
        time.sleep(random.random())
        res = requests.get(url, headers=headers)
        res.encoding = "utf-8"
        end_href = re.findall('下一页</a><a href="(/.*?.html)">尾页</a>', res.text)[0]  # 获取最后一页的url
        end_list = end_href.rstrip(".html").split('_')
        end_num = int(end_list[1])  # 获取最大页数
        for i in range(2, end_num + 1):
            page_url = "http://www.shu800.com" + end_list[0] + "_" + str(i) + ".html"
            get_image(page_url)
    except requests.exceptions:
        headers["User-Agent"] = ua.random
        get_img_page(url)
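
# A worked example of the pagination math above (the path is hypothetical):
#   end_href == "/category/12345_8.html"
#   end_list == ["/category/12345", "8"], so end_num == 8,
#   and the loop visits .../12345_2.html through .../12345_8.html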
 
 
def get_image(url):
    """
    爬取图片展示页面,获取图片名称和链接进行下载
    :param url: 图片展示页URL
    :return:
    """
    global x
    if x < 10:
        x += 1
    else:
        x = 1
        time.sleep(random.randint(2, 5))
 
    res = requests.get(url, headers=headers)
    res.encoding = "utf-8"
    et = etree.HTML(res.text)
    try:
        img_name = et.xpath('/html/body/div[5]/div[1]/div[1]/div[2]/div[4]/span/p/img/@alt')  # image name
        img_url = et.xpath('/html/body/div[5]/div[1]/div[1]/div[2]/div[4]/span/p/img/@src')  # image link
        if img_url:
            img_name, img_url = img_name[0], img_url[0]
            if "_" not in url:  # the first page of a gallery carries no page number
                img_name += "-1"
            else:
                num = url.rstrip(".html").split("_")[1]  # page number taken from the URL
                img_name = img_name + "-" + num
            with open("Images/" + img_name + ".jpg", "wb") as f:  # download the image
                f.write(requests.get(img_url).content)
    except Exception:
        pass
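
# Naming example for the download above (hypothetical URLs):
#   ".../12345.html"   -> "<alt text>-1.jpg"   (first page of the gallery)
#   ".../12345_3.html" -> "<alt text>-3.jpg"   (page 3)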
 
 
if __name__ == '__main__':
    os.makedirs("Images", exist_ok=True)  # the download directory must exist before writing
    get_urls()
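
The worker above only consumes URLs, so something has to produce them. Below is a minimal sketch of the seeding side in the spirit of the master script from the referenced cnblogs article; the start URL and the XPath for gallery links are placeholders for illustration, not taken from the original.

import requests
from lxml import etree
from redis import Redis

r = Redis(host="192.168.229.130", port=6379, db=1)  # same Redis instance the workers read from


def seed(start_url):
    """Collect gallery links from a list page and push them into the 'href' set."""
    res = requests.get(start_url)
    res.encoding = "utf-8"
    et = etree.HTML(res.text)
    hrefs = et.xpath('//a/@href')  # placeholder XPath: narrow it to the site's real gallery links
    for href in hrefs:
        if href.endswith(".html"):
            r.sadd("href", "http://www.shu800.com" + href)  # SADD deduplicates automatically


if __name__ == '__main__':
    seed("http://www.shu800.com/")  # placeholder start page

Run the seeder once on the master, then start the worker script on each slave; because the workers take URLs with the atomic SPOP, every URL is crawled exactly once.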