A Distributed Crawler in a Hundred Lines of Code


It only takes three steps

  1. First, build a single-machine crawler: use a set for URL deduplication and a list's pop/append as the task queue
  2. Then swap that set and list for Redis's set and list (see the sketch after this list)
  3. Finally, run it with multiple processes or across several servers
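
Before the full example, here is a minimal sketch of how steps 1 and 2 line up. The key names, the synchronous redis-py client, and the example URL are illustrative placeholders rather than part of the project:

# Step 1: single-machine crawler state (plain Python objects)
seen = set()                                  # URL deduplication
queue = ['https://example.com']               # task queue
url = queue.pop(0)                            # take the next task
if url not in seen:
    seen.add(url)
    # ... fetch and parse, then queue.append() any newly found URLs ...

# Step 2: the same operations backed by Redis, so every process and machine shares them
import redis                                  # assumes the redis-py package and a local Redis server

r = redis.Redis(host='127.0.0.1', port=6379, decode_responses=True)
r.rpush('url_queue', 'https://example.com')   # list append
url = r.lpop('url_queue')                     # list pop
if not r.sismember('seen_urls', url):         # set membership test
    r.sadd('seen_urls', url)                  # set add
    # ... fetch and parse, then r.rpush() any newly found URLs ...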

Example code

# -*- coding: utf-8 -*-

# Author: 桑葚ICE
# Email: 152516cc@gmail.com
# Blog: iicey.github.io
# JueJin: juejin.im/user/5c64dce8e51d45013c40742c
import asyncio
import logging
import os

import aiohttp
import aioredis
from aiomultiprocess import Pool
from lxml import etree

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(os.path.splitext(os.path.basename(__file__))[0])
password = '123546'
host = '127.0.0.1'
port = 6379


class MoreAuthor:
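    """Crawler worker that deduplicates URLs with a Redis set and shares work through a Redis list."""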

    def __init__(self, conn, redis):
        self.conn = conn
        self.redis = redis

    async def req_res(self, session, url: str, result_type: str = 'text'):
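        """Fetch url once: skip it if it is already in the Redis dedup set, otherwise return text, bytes or JSON."""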
        if result_type not in {'text', 'bytes', 'json'}:
            raise ValueError(f"The result_type cannot be {result_type}. Use only: text, bytes, json!")
        if await self.redis.sismember('already_crawl_urls', url):
            logger.debug(f"Duplicate URL: {url}")
            return
        await self.redis.sadd('already_crawl_urls', url)
        async with session.get(url, verify_ssl=False) as result:
            if result_type == 'text':
                return await result.text()
            if result_type == 'bytes':
                return await result.read()
            if result_type == 'json':
                return await result.json()

    async def product_and_consumer(self, session, main_url):
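        """Fetch an author's follow page, then push every unseen author URL onto the shared Redis queue."""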
        if 'follow' not in main_url:
            main_url += '/follow?condition=0&p=1'
        main_result = await self.req_res(session, main_url)
        if main_result:
            logger.info(f"auth_main: {main_url}")
            author_url_list = await self.parse_more_author(session, main_result)
            for author_url in author_url_list:
                author_url = str(author_url)
                if not await self.redis.sismember('already_crawl_urls', author_url):
                    logger.info(f"新增作者URl:{author_url}")
                    await self.redis.sadd('already_crawl_urls', author_url)
                    await self.redis.rpush('author_url_queue', author_url)

    async def parse_more_author(self, session, main_result):
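        """Extract author profile links from the page and recursively follow the pagination links."""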
        tree = etree.HTML(main_result)
        author_url_list = list(tree.xpath('//*[@class="author-info-title"]/a/@href'))
        page_url_list = tree.xpath('//*[@id="laypage_0"]/a/@href')
        for page_url in page_url_list:
            if page_url[0] == '/':
                page_url = ''.join(['https://www.zcool.com.cn', page_url])
            page_result = await self.req_res(session, str(page_url))
            if page_result:
                # logger.info(f"author_page: {page_url}")
                next_author_url_list = await self.parse_more_author(session, page_result)
                author_url_list.extend(next_author_url_list)
        return author_url_list

    async def author_run(self):
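        """Seed the start URL, then keep consuming author URLs from the Redis queue until it stays empty for a minute."""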
        start_url = 'https://morncolour.zcool.com.cn/follow?condition=0&p=1'
        sleep_count = 0
        async with aiohttp.ClientSession() as session:
            if not await self.redis.sismember('already_crawl_urls', start_url):
                await self.product_and_consumer(session, start_url)
            while True:
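                # Exit once the queue has stayed empty for 60 consecutive one-second checks.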
                if sleep_count >= 60:
                    break
                if await self.redis.llen('author_url_queue') == 0:
                    sleep_count += 1
                    await asyncio.sleep(1)
                    continue
                sleep_count = 0
                await self.product_and_consumer(session, await self.redis.lpop('author_url_queue'))


async def run(num):
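    """Per-process worker: open its own Redis connection pool, run one crawler instance, then close the pool."""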
    logger.info(f"这是第{num}个进程任务")
    conn = await aioredis.create_pool(f'redis://:{password}@{host}:{port}', encoding='utf-8')
    redis = aioredis.Redis(pool_or_conn=conn)
    m = MoreAuthor(conn, redis)
    await m.author_run()
    conn.close()
    await conn.wait_closed()


async def main():
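    """Fan the crawler out over four worker processes with aiomultiprocess."""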
    async with Pool() as pool:
        result = await pool.map(run, range(4))
        logger.info(result)


if __name__ == '__main__':
    asyncio.run(main())
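
Because all shared state (the deduplication set and the task queue) lives in Redis, running this across several servers only requires pointing host at a Redis instance reachable from every machine and starting the script on each of them.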

Project repository

github.com/iicey/zcool