《流畅的Python》读书笔记20(第十七章:使用future处理并发)

111 阅读1分钟

future指一种对象,表示异步执行的操作

17.1 网络下载的三种风格

  • 单线程
  • 多线程
  • 异步asyncio

17.1.1 单线程下载

import time

import requests

# Target pages: the 50 paginated listing URLs on cnblogs.com.
urls = [f"https://www.cnblogs.com/#p{n}" for n in range(1, 51)]


def craw(url):
    """Fetch *url* and print it together with the response body length."""
    resp = requests.get(url)
    print(url, len(resp.text))

def single_thread():
    """Download every URL sequentially and report the total wall time."""
    started = time.time()
    for target in urls:
        craw(target)
    # Sequential baseline: each request blocks until the previous finished.
    print(f'single spider in {time.time() - started:.2f}s')


if __name__ == "__main__":
    single_thread()

耗时6.96s

17.1.2 多线程下载

import threading
import time

import requests

urls = [f"https://www.cnblogs.com/#p{i}" for i in range(1, 51)]  # listing pages 1..50

def craw(url):
    """GET *url*, printing the URL and the length of the returned text."""
    body = requests.get(url).text
    print(url, len(body))

def multi_thread():
    """Download every URL concurrently, one thread per URL, and report wall time.

    One thread per URL is acceptable for 50 URLs; for larger batches prefer a
    ThreadPoolExecutor to bound the number of threads.
    """
    t0 = time.time()
    # Create one worker thread per URL (not started yet).
    threads = [threading.Thread(target=craw, args=(url,)) for url in urls]

    for thread in threads:
        thread.start()

    # Wait for every download to finish before measuring elapsed time.
    for thread in threads:
        thread.join()

    elapsed = time.time() - t0
    # BUG FIX: the label previously said 'single spider' — a copy-paste from
    # the sequential version; it now matches what this function measures.
    print(f'multi thread spider in {elapsed:.2f}s')


if __name__ == "__main__":
    multi_thread()

耗时0.49s

使用 concurrent.futures 的 ThreadPoolExecutor

# encoding:utf-8
import threading
import time
from concurrent.futures import ThreadPoolExecutor

import requests

# The 50 cnblogs.com listing pages we will crawl.
urls = [f"https://www.cnblogs.com/#p{page_no}" for page_no in range(1, 51)]

def craw(url):
    """Download one page and print its URL plus body size in characters."""
    response = requests.get(url)
    print(url, len(response.text))

def thread_pool():
    """Download every URL via ThreadPoolExecutor.map and report wall time."""
    t0 = time.time()
    with ThreadPoolExecutor() as executor:
        # map() submits all URLs at once; results come back in input order.
        results = executor.map(craw, urls)

    # craw() returns None, so this only confirms how many tasks completed
    # (exiting the `with` block above waited for all of them).
    print(len(list(results)))

    elapsed = time.time() - t0
    # BUG FIX: the label previously said 'single spider' — a copy-paste from
    # the sequential version; it now names the strategy being timed.
    print(f'thread pool spider in {elapsed:.2f}s')


if __name__ == "__main__":
    thread_pool()

使用executor.submit 和 futures.as_completed

import concurrent
import threading
import time
from concurrent.futures import ThreadPoolExecutor

import requests

# URLs of the 50 cnblogs.com listing pages to download.
urls = [f"https://www.cnblogs.com/#p{k}" for k in range(1, 51)]

def craw(url):
    """Fetch *url* with requests and print the URL and response length."""
    text = requests.get(url).text
    print(url, len(text))

def thread_pool():
    """Download URLs via executor.submit, handling results as they complete."""
    t0 = time.time()
    with ThreadPoolExecutor() as executor:
        # Map each Future back to the URL it is downloading.
        future_to_url = {executor.submit(craw, url): url for url in urls}

        # FIX: iterate as_completed *inside* the `with` block. In the original
        # the loop ran after the block, but exiting the `with` waits for every
        # future first — so nothing was actually processed "as completed".
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            # craw() returns None; .result() also re-raises any exception
            # the worker thread hit during the download.
            print(url, future.result())

    elapsed = time.time() - t0
    # BUG FIX: the label previously said 'single spider' — a copy-paste from
    # the sequential version; dead commented-out code was also removed.
    print(f'submit spider in {elapsed:.2f}s')


if __name__ == "__main__":
    thread_pool()