本文已参与「新人创作礼」活动,一起开启掘金创作之路。
import asyncio
import time
import aiohttp
import os
import re
import sys
import aiofiles
'''Module-level setup for the crawler: shared URL lists, request headers,
page count read from the user, and the output directory.'''
urls = []    # listing-page URLs built from the user-supplied page count
url_s = []   # image URLs scraped out of those listing pages
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40'}
# Number of listing pages to crawl (pages are indexed starting from 2 below).
k = int(input("请输入爬取页面数:"))
# BUG FIX: the original literal ended in a single backslash ('D:\...\'),
# which escaped the closing quote and made the whole file a SyntaxError.
# Backslashes in a Windows path must be escaped (a raw string cannot end
# with a backslash either).
filename = 'D:\\多协程图片爬取\\'
# async def f1():
# print("f1运行1")
# await asyncio.sleep(1)
# print("f1运行2")
# async def f2():
# print("f2运行1")
# await asyncio.sleep(2)
# print("f2运行2")
# async def f3():
# print("f3运行1")
# await asyncio.sleep(3)
# print("f3运行2")
# async def main():
# tasks = [
# f1(),
# f2(),
# f3()
# ]
# await asyncio.wait(tasks)
# if __name__ == '__main__':
# t1 = time.time()
# asyncio.run(main())
# t2 = time.time()
# print(t2-t1)
async def houqu():
    """Scrape listing pages 2..k and collect image URLs into ``url_s``.

    For each page index, builds the listing URL (also recorded in the
    module-level ``urls`` list), fetches the page HTML, and extracts the
    lazy-loaded image sources with a regex.  Appends all matches to the
    module-level ``url_s`` list.
    """
    global url_s
    # Hoist the pattern out of the loop: compile once, reuse every page.
    img_pattern = re.compile(
        '<img lazysrc="(.*?)" lazysrc2x=".*?" height="348px" alt=".*?" title=".*?" />')
    # Reuse ONE ClientSession for every page request — the aiohttp docs
    # recommend a single long-lived session instead of one per request.
    async with aiohttp.ClientSession() as session:
        for i in range(2, k + 1):  # listing pages start at index 2
            u = 'https://www.3gbizhi.com/tag/dongman/{}.html'.format(i)
            urls.append(u)
            async with session.get(url=u, headers=headers) as resp:
                html = await resp.text()
            url_s += img_pattern.findall(html)
async def download(url):
    """Fetch one image by URL and write it into the output directory.

    The local file name is the last path segment of the image URL; the
    destination directory comes from the module-level ``filename`` prefix.
    """
    # Last URL path segment doubles as the local file name.
    name = url.split('/')[-1]
    print("准备下载")
    # Fetch the image bytes, then persist them asynchronously via aiofiles.
    async with aiohttp.ClientSession() as http:
        async with http.get(url=url, headers=headers) as reply:
            body = await reply.content.read()
    async with aiofiles.open(filename + name, mode='wb') as out:
        await out.write(body)
    print("下载完成")
async def main():
    """Entry coroutine: scrape the image URLs, then download them all concurrently."""
    # houqu() fills the module-level url_s list with image URLs.
    await houqu()
    tasks = [asyncio.create_task(download(url)) for url in url_s]
    # BUG FIX: asyncio.wait() raises ValueError when given an empty set of
    # tasks, so guard against the case where no image URLs were scraped.
    if tasks:
        await asyncio.wait(tasks)
if __name__ == '__main__':
    # Time the whole crawl-and-download run end to end.
    started = time.time()
    asyncio.run(main())
    finished = time.time()
    print('总花费时间:{}秒'.format(finished - started))
异步爬取的效率非常高,400多张照片只要10秒,大家可以对比同步爬取的耗时。另外,我知乎的账号手机号无法换绑,想着转战掘金写博客,既激励自己,也方便感兴趣的小伙伴一起学习进步。
讲一下技术原理和心得体会: