future指一种对象,表示异步执行的操作
17.1 网络下载的三种风格
- 单线程
- 多线程
- 异步asyncio
17.1.1 单线程下载
import time
import requests
# The 50 addresses downloaded by the demo, one per page number 1..50.
# NOTE(review): "#p<N>" is a URL fragment; HTTP clients normally do not send
# fragments to the server, so every entry may return the same page -- confirm.
urls = [
    f"https://www.cnblogs.com/#p{page}"
    for page in range(1, 51)
]
def craw(url, timeout=10):
    """Download *url*, print its address and body length, and return the length.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The number of characters in the decoded response body.

    Raises:
        requests.exceptions.RequestException: If the connection fails or the
            timeout expires.
    """
    # Without a timeout requests waits indefinitely, so a single stalled
    # connection would hang the whole run.
    r = requests.get(url, timeout=timeout)
    size = len(r.text)
    print(url, size)
    return size
def single_thread():
    """Fetch every address in ``urls`` one after another and print the elapsed time."""
    started = time.time()
    for link in urls:
        craw(link)
    elapsed = time.time() - started
    print(f'single spider in {elapsed:.2f}s')
# Run the sequential demo only when this file is executed as a script.
if __name__ == "__main__":
    single_thread()
耗时6.96s
17.1.2 多线程下载
import threading
import time
import requests
# The 50 addresses downloaded by the demo, one per page number 1..50.
# NOTE(review): "#p<N>" is a URL fragment; HTTP clients normally do not send
# fragments to the server, so every entry may return the same page -- confirm.
urls = [
    f"https://www.cnblogs.com/#p{page}"
    for page in range(1, 51)
]
def craw(url, timeout=10):
    """Download *url*, print its address and body length, and return the length.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The number of characters in the decoded response body.

    Raises:
        requests.exceptions.RequestException: If the connection fails or the
            timeout expires.
    """
    # Without a timeout requests waits indefinitely; in the threaded demo a
    # stalled connection would keep join() from ever returning.
    r = requests.get(url, timeout=timeout)
    size = len(r.text)
    print(url, size)
    return size
def multi_thread():
    """Fetch every address in ``urls`` with one thread per URL and print the elapsed time.

    All threads are started before any is joined, so the downloads overlap.
    """
    t0 = time.time()
    threads = [threading.Thread(target=craw, args=(url,)) for url in urls]
    for thread in threads:
        thread.start()
    # Wait for every download to finish before stopping the clock.
    for thread in threads:
        thread.join()
    elapsed = time.time() - t0
    # The label names the strategy actually measured here.
    print(f'multi thread spider in {elapsed:.2f}s')
# Run the one-thread-per-URL demo only when this file is executed as a script.
if __name__ == "__main__":
    multi_thread()
耗时0.49s
使用concurrent.futures
# encoding:utf-8
import threading
import time
from concurrent.futures import ThreadPoolExecutor
import requests
# The 50 addresses downloaded by the demo, one per page number 1..50.
# NOTE(review): "#p<N>" is a URL fragment; HTTP clients normally do not send
# fragments to the server, so every entry may return the same page -- confirm.
urls = [
    f"https://www.cnblogs.com/#p{page}"
    for page in range(1, 51)
]
def craw(url, timeout=10):
    """Download *url*, print its address and body length, and return the length.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The number of characters in the decoded response body.

    Raises:
        requests.exceptions.RequestException: If the connection fails or the
            timeout expires.
    """
    # Without a timeout requests waits indefinitely; a stalled connection
    # would tie up a pool worker and block the executor from finishing.
    r = requests.get(url, timeout=timeout)
    size = len(r.text)
    print(url, size)
    return size
def thread_pool():
    """Fetch every address in ``urls`` through a ThreadPoolExecutor and print timing.

    Prints the number of results produced by ``executor.map`` and then the
    elapsed time.
    """
    t0 = time.time()
    with ThreadPoolExecutor() as executor:
        # map() yields results in input order; list() drains the iterator so
        # every download has finished before the count is taken.
        results = list(executor.map(craw, urls))
    print(len(results))
    elapsed = time.time() - t0
    # The label names the strategy actually measured here.
    print(f'thread pool spider in {elapsed:.2f}s')
# Run the executor.map demo only when this file is executed as a script.
if __name__ == "__main__":
    thread_pool()
使用executor.submit 和 futures.as_completed
import concurrent
import threading
import time
from concurrent.futures import ThreadPoolExecutor
import requests
# The 50 addresses downloaded by the demo, one per page number 1..50.
# NOTE(review): "#p<N>" is a URL fragment; HTTP clients normally do not send
# fragments to the server, so every entry may return the same page -- confirm.
urls = [
    f"https://www.cnblogs.com/#p{page}"
    for page in range(1, 51)
]
def craw(url, timeout=10):
    """Download *url*, print its address and body length, and return the length.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The number of characters in the decoded response body. Callers that
        read the result of a submitted future receive this value.

    Raises:
        requests.exceptions.RequestException: If the connection fails or the
            timeout expires.
    """
    # Without a timeout requests waits indefinitely; a stalled connection
    # would tie up a pool worker and block the executor from finishing.
    r = requests.get(url, timeout=timeout)
    size = len(r.text)
    print(url, size)
    return size
def thread_pool():
    """Fetch every address in ``urls`` via ``executor.submit`` and print timing.

    Each URL is submitted as its own future. As each future finishes, its URL
    and the value returned by ``craw`` are printed, followed by the total
    elapsed time once all are done.
    """
    t0 = time.time()
    with ThreadPoolExecutor() as executor:
        # Map each future back to the URL it was submitted for.
        futures = {executor.submit(craw, url): url for url in urls}
        # as_completed() yields futures as they finish rather than in
        # submission order. Iterating futures.items() and calling result()
        # on each would report them in submission order instead.
        for future in concurrent.futures.as_completed(futures):
            print(futures[future], future.result())
    elapsed = time.time() - t0
    # The label names the strategy actually measured here.
    print(f'thread pool spider in {elapsed:.2f}s')
# Run the submit/as_completed demo only when this file is executed as a script.
if __name__ == "__main__":
    thread_pool()