python异步爬取某网站二次元图片

253 阅读2分钟

本文已参与「新人创作礼」活动,一起开启掘金创作之路。

import asyncio
import time
import aiohttp
import os
import re
import sys
import aiofiles
# Script-level configuration: list-page URLs, scraped image URLs,
# request headers, page count, and the local save directory.
urls = []   # list-page URLs (filled by houqu)
url_s = []  # image URLs extracted from the list pages
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40'}

k = int(input("请输入爬取页面数:"))

# BUG FIX: the original literal ended in a single backslash ('D:\...\'),
# which escapes the closing quote and is a SyntaxError. Double the
# backslashes so the Windows path is a valid string literal.
filename = 'D:\\多协程图片爬取\\'





# async def f1():
#     print("f1运行1")
#     await asyncio.sleep(1)
#     print("f1运行2")
# async def f2():
#     print("f2运行1")
#     await asyncio.sleep(2)
#     print("f2运行2")
# async def f3():
#     print("f3运行1")
#     await asyncio.sleep(3)
#     print("f3运行2")
# async def main():
#     tasks = [
#         f1(),
#         f2(),
#         f3()
#     ]
#     await asyncio.wait(tasks)
# if __name__ == '__main__':
#     t1 = time.time()
#     asyncio.run(main())
#     t2 = time.time()
#     print(t2-t1)

async def houqu():
    """Fetch list pages 2..k and accumulate image URLs into the global url_s.

    Side effects: appends each page URL to the global ``urls`` list and
    extends the global ``url_s`` with every image address found by the
    regex on that page's HTML. Returns None.
    """
    global url_s
    # FIX: reuse one ClientSession for all page requests. The original
    # created a fresh session (and connection pool) on every loop
    # iteration, which aiohttp's docs advise against.
    async with aiohttp.ClientSession() as session:
        for i in range(2, k + 1):  # pages start at 2 on this site
            u = 'https://www.3gbizhi.com/tag/dongman/{}.html'.format(i)
            urls.append(u)
            async with session.get(url=u, headers=headers) as resp:
                html = await resp.text()
                # NOTE(review): regex scraping is brittle — any change to
                # the site's <img> markup silently yields zero matches.
                ur_ = re.findall('<img lazysrc="(.*?)" lazysrc2x=".*?" height="348px" alt=".*?" title=".*?" />', html)
                url_s += ur_



async def download(url):
    """Fetch a single image from *url* and save it under ``filename``.

    The saved file is named after the last path segment of the URL,
    which already carries its extension (e.g. ``12345.jpg``).
    """
    fname = url.split('/')[-1]
    print("准备下载")
    # One session per download keeps the coroutine self-contained.
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url, headers=headers) as resp:
            # resp.content.read() is the async counterpart of
            # response.content in the synchronous requests library.
            data = await resp.content.read()
    async with aiofiles.open(filename + fname, mode='wb') as f:
        await f.write(data)
    print("下载完成")
async def main():
    """Collect all image URLs, then download them concurrently."""
    await houqu()  # fills the global url_s with image addresses
    # One task per image so downloads overlap on the event loop.
    tasks = [asyncio.create_task(download(u)) for u in url_s]
    # FIX: guard the empty case — asyncio.wait([]) raises ValueError
    # (e.g. when the user requests only 1 page, so no URLs are scraped).
    # gather() is also the idiomatic way to await an explicit task list.
    if tasks:
        await asyncio.gather(*tasks)


if __name__ == '__main__':
    # Time the whole crawl so the async run can be compared with a
    # synchronous version of the same scraper.
    start = time.time()
    asyncio.run(main())
    elapsed = time.time() - start
    print('总花费时间:{}秒'.format(elapsed))

异步爬取的效率非常高,400多张照片只要10秒,大家可以对比同步的,另外,我知乎的账号手机号无法换绑,想着转战掘金写博客,既激励自己,也方便感兴趣的小伙伴一起学习进步

讲一下技术原理和心得体会:

1.异步是为了提高CPU资源的利用率:当一个任务在等待(例如网络I/O)时,事件循环立刻切换去执行下一个任务,不让CPU空闲下来

2.我们编写异步函数时,一定要在前面加上async,表示这是一个异步函数,在加入task前加上await,让程序识别出来

3.session.get相当于同步中的get,用with语法可以在代码块结束时自动释放连接资源,无需手动关闭,

4. url.split('/')[-1] 就是用split函数取URL的最后一段作为文件名,这一段本身已经带有 .jpg 等扩展名,不用我们再加了