Python多进程、多线程、多协程爬取数据的demo

阅读量 70 · 预计阅读时长 4 分钟

单进程单线程

import requests

import time

from bs4 import BeautifulSoup



head = {

    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',

    'Referer': 'https://time.geekbang.org',

    'Connection': 'keep-alive'}



def fetch_content(url_to_fetch):
    """Download *url_to_fetch* synchronously and return the raw body as bytes."""
    response = requests.get(url_to_fetch, headers=head)
    return response.content



def main():
    """Scrape Douban's "coming soon" page for Beijing and print each movie's
    name, release date and poster URL, fetching detail pages sequentially.
    """
    url = "https://movie.douban.com/cinema/later/beijing/"
    init_page = requests.get(url, headers=head).content
    init_soup = BeautifulSoup(init_page, 'html.parser')
    movie_names, urls_to_fetch, movie_dates = [], [], []
    # NOTE(review): assumes the page layout — a div#showing-soon containing
    # div.item entries whose second <a> is the detail link; verify if scraping fails.
    all_movies = init_soup.find('div', id="showing-soon")
    for each_movie in all_movies.find_all('div', class_="item"):
        all_a_tag = each_movie.find_all('a')
        all_li_tag = each_movie.find_all('li')
        movie_names.append(all_a_tag[1].text)
        urls_to_fetch.append(all_a_tag[1]['href'])
        movie_dates.append(all_li_tag[0].text)
    # Use a distinct loop variable so the listing-page `url` above is not
    # clobbered by the comprehension (the original shadowed it).
    pages = [fetch_content(detail_url) for detail_url in urls_to_fetch]
    for movie_name, movie_date, page in zip(movie_names, movie_dates, pages):
        soup_item = BeautifulSoup(page, 'html.parser')
        # First <img> on a detail page is the poster — presumably stable; confirm.
        img_tag = soup_item.find('img')
        print('{} {} {}'.format(movie_name, movie_date, img_tag['src']))





if __name__ == "__main__":
    # Time the whole scrape end to end.
    start = time.perf_counter()
    main()
    elapsed = time.perf_counter() - start
    print('scrape sites in {} seconds'.format(elapsed))



# scrape sites in 13.5433963 seconds

单线程多协程

import time

import asyncio

import aiohttp



from bs4 import BeautifulSoup



header = {

    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',

    'Referer': 'https://time.geekbang.org/column/article/101855',

    'Connection': 'keep-alive'}



async def fetch_content(url):
    """Asynchronously GET *url* and return the decoded response body."""
    connector = aiohttp.TCPConnector(ssl=False)  # skip TLS certificate verification
    async with aiohttp.ClientSession(headers=header, connector=connector) as session:
        async with session.get(url) as response:
            return await response.text()





async def main():
    """Scrape Douban's "coming soon" page for Beijing and print each movie's
    name, release date and poster URL, fetching detail pages concurrently.
    """
    url = "https://movie.douban.com/cinema/later/beijing/"
    init_page = await fetch_content(url)
    init_soup = BeautifulSoup(init_page, 'html.parser')
    movie_names, urls_to_fetch, movie_dates = [], [], []
    # NOTE(review): assumes div#showing-soon > div.item layout with the second
    # <a> pointing at the detail page; verify if scraping fails.
    all_movies = init_soup.find('div', id="showing-soon")
    for each_movie in all_movies.find_all('div', class_="item"):
        all_a_tag = each_movie.find_all('a')
        all_li_tag = each_movie.find_all('li')
        movie_names.append(all_a_tag[1].text)
        urls_to_fetch.append(all_a_tag[1]['href'])
        movie_dates.append(all_li_tag[0].text)
    # Spawn all detail-page fetches at once; use a distinct loop variable so
    # the listing-page `url` above is not clobbered (the original shadowed it).
    tasks = [asyncio.create_task(fetch_content(detail_url))
             for detail_url in urls_to_fetch]
    # gather() preserves task order, so pages align with names/dates below.
    pages = await asyncio.gather(*tasks)
    for movie_name, movie_date, page in zip(movie_names, movie_dates, pages):
        soup_item = BeautifulSoup(page, 'html.parser')
        img_tag = soup_item.find('img')
        print('{} {} {}'.format(movie_name, movie_date, img_tag['src']))





if __name__ == "__main__":
    # Time the whole async scrape end to end.
    start = time.perf_counter()
    asyncio.run(main())
    elapsed = time.perf_counter() - start
    print('scrape sites in {} seconds'.format(elapsed))



# scrape sites in 3.007893 seconds

单进程多线程

MyThread类封装

import time

import requests

from threading import Thread

from bs4 import BeautifulSoup



header = {

    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',

    'Referer': 'https://time.geekbang.org/column/article/101855',

    'Connection': 'keep-alive'}





def fetch_content(url_to_fetch):
    """Download *url_to_fetch* synchronously and return the raw body as bytes."""
    response = requests.get(url_to_fetch, headers=header)
    return response.content





class MyThread(Thread):
    """Thread that captures the return value of its target function.

    ``threading.Thread`` discards the target's return value; this subclass
    stores it so callers can retrieve it with :meth:`get_result` after
    ``join()``.
    """

    def __init__(self, func, args):
        super().__init__()
        self.func = func    # callable executed on the worker thread
        self.args = args    # positional arguments passed to func
        # Initialize eagerly: the original left `result` unset until run(),
        # relying on a broad `except Exception` to mask the AttributeError.
        self.result = None

    def run(self):
        self.result = self.func(*self.args)

    def get_result(self):
        """Return the captured result, or None if run() has not completed."""
        return self.result





def fetch_all(sites):
    """Fetch every URL in *sites* concurrently, one thread per URL.

    Returns the page bodies in the same order as *sites*.
    """
    threads = [MyThread(fetch_content, args=(site,)) for site in sites]
    for thread in threads:
        thread.start()
    results = []
    # Joining in submission order keeps results aligned with the input URLs.
    for thread in threads:
        thread.join()
        results.append(thread.get_result())
    return results





def main():
    """Scrape Douban's "coming soon" page for Beijing and print each movie's
    name, release date and poster URL.
    """
    listing_url = "https://movie.douban.com/cinema/later/beijing/"
    listing_soup = BeautifulSoup(fetch_content(listing_url), 'html.parser')
    names, detail_urls, dates = [], [], []
    showing_soon = listing_soup.find('div', id="showing-soon")
    for item in showing_soon.find_all('div', class_="item"):
        anchors = item.find_all('a')
        list_items = item.find_all('li')
        names.append(anchors[1].text)
        detail_urls.append(anchors[1]['href'])
        dates.append(list_items[0].text)
    detail_pages = fetch_all(detail_urls)
    for name, date, html in zip(names, dates, detail_pages):
        # First <img> on a detail page is the movie poster.
        poster = BeautifulSoup(html, 'html.parser').find('img')
        print('{} {} {}'.format(name, date, poster['src']))





if __name__ == "__main__":
    # Time the whole scrape end to end.
    start = time.perf_counter()
    main()
    elapsed = time.perf_counter() - start
    print('scrape sites in {} seconds'.format(elapsed))



# scrape sites in 3.1128052000000004 seconds

Futures类

import concurrent.futures

import time

import requests

from bs4 import BeautifulSoup



header = {

    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',

    'Referer': 'https://time.geekbang.org/column/article/101855',

    'Connection': 'keep-alive'}





def fetch_content(url_to_fetch):
    """Download *url_to_fetch* synchronously and return the raw body as bytes."""
    response = requests.get(url_to_fetch, headers=header)
    return response.content





def fetch_all(sites):
    """Fetch every URL in *sites* with a thread pool.

    Returns the page bodies in the same order as *sites*.  The original
    collected results via ``as_completed``, which yields futures in
    *completion* order — the caller zips the pages with movie names/dates by
    index, so out-of-order completions silently paired pages with the wrong
    movie.  Collecting in submission order fixes that.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(fetch_content, site) for site in sites]
        # future.result() blocks until that future is done; iterating the
        # submission list preserves input order.
        return [future.result() for future in futures]





def main():
    """Scrape Douban's "coming soon" page for Beijing and print each movie's
    name, release date and poster URL.
    """
    listing_url = "https://movie.douban.com/cinema/later/beijing/"
    listing_soup = BeautifulSoup(fetch_content(listing_url), 'html.parser')
    names, detail_urls, dates = [], [], []
    showing_soon = listing_soup.find('div', id="showing-soon")
    for item in showing_soon.find_all('div', class_="item"):
        anchors = item.find_all('a')
        list_items = item.find_all('li')
        names.append(anchors[1].text)
        detail_urls.append(anchors[1]['href'])
        dates.append(list_items[0].text)
    detail_pages = fetch_all(detail_urls)
    for name, date, html in zip(names, dates, detail_pages):
        # First <img> on a detail page is the movie poster.
        poster = BeautifulSoup(html, 'html.parser').find('img')
        print('{} {} {}'.format(name, date, poster['src']))





if __name__ == "__main__":
    # Time the whole scrape end to end.
    start = time.perf_counter()
    main()
    elapsed = time.perf_counter() - start
    print('scrape sites in {} seconds'.format(elapsed))



# scrape sites in 4.3634426 seconds

多进程单线程

全局变量共享

import time

import requests

import multiprocessing

from multiprocessing import Manager

from bs4 import BeautifulSoup



header = {

    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',

    'Referer': 'https://time.geekbang.org/column/article/101855',

    'Connection': 'keep-alive'}





def fetch_content(url_to_fetch):
    """Download *url_to_fetch* synchronously and return the raw body as bytes."""
    response = requests.get(url_to_fetch, headers=header)
    return response.content





def worker(site, return_dict):
    """Fetch *site* and store the page bytes in the shared *return_dict*, keyed by URL."""
    page = fetch_content(site)
    return_dict[site] = page





def fetch_all(sites):
    """Fetch every URL in *sites*, one child process per URL.

    Returns the page bodies in the same order as *sites*.  The original
    returned ``result_dict.values()``, whose order reflects when each process
    finished writing — the caller zips pages with movie names/dates by index,
    so out-of-order completions silently paired pages with the wrong movie.
    Indexing the shared dict by site fixes that.
    """
    manager = Manager()
    result_dict = manager.dict()  # proxy dict shared across processes
    jobs = [multiprocessing.Process(target=worker, args=(site, result_dict))
            for site in sites]
    for job in jobs:
        job.start()
    for job in jobs:
        job.join()
    # Look up by site URL so ordering matches the input, not completion order.
    return [result_dict[site] for site in sites]





def main():
    """Scrape Douban's "coming soon" page for Beijing and print each movie's
    name, release date and poster URL.
    """
    listing_url = "https://movie.douban.com/cinema/later/beijing/"
    listing_soup = BeautifulSoup(fetch_content(listing_url), 'html.parser')
    names, detail_urls, dates = [], [], []
    showing_soon = listing_soup.find('div', id="showing-soon")
    for item in showing_soon.find_all('div', class_="item"):
        anchors = item.find_all('a')
        list_items = item.find_all('li')
        names.append(anchors[1].text)
        detail_urls.append(anchors[1]['href'])
        dates.append(list_items[0].text)
    detail_pages = fetch_all(detail_urls)
    for name, date, html in zip(names, dates, detail_pages):
        # First <img> on a detail page is the movie poster.
        poster = BeautifulSoup(html, 'html.parser').find('img')
        print('{} {} {}'.format(name, date, poster['src']))





if __name__ == "__main__":
    # Time the whole scrape end to end.
    start = time.perf_counter()
    main()
    elapsed = time.perf_counter() - start
    print('scrape sites in {} seconds'.format(elapsed))



# scrape sites in 6.8475984 seconds

Futures类

import concurrent.futures

import time

import requests

from bs4 import BeautifulSoup



header = {

    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',

    'Referer': 'https://time.geekbang.org/column/article/101855',

    'Connection': 'keep-alive'}



def fetch_content(url_to_fetch):
    """Download *url_to_fetch* synchronously and return the raw body as bytes."""
    response = requests.get(url_to_fetch, headers=header)
    return response.content



def fetch_all(sites):
    """Fetch every URL in *sites* with a process pool.

    Returns the page bodies in the same order as *sites*.  The original
    collected results via ``as_completed``, which yields futures in
    *completion* order — the caller zips the pages with movie names/dates by
    index, so out-of-order completions silently paired pages with the wrong
    movie.  Collecting in submission order fixes that.
    """
    with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(fetch_content, site) for site in sites]
        # future.result() blocks until that future is done; iterating the
        # submission list preserves input order.
        return [future.result() for future in futures]



def main():
    """Scrape Douban's "coming soon" page for Beijing and print each movie's
    name, release date and poster URL.
    """
    listing_url = "https://movie.douban.com/cinema/later/beijing/"
    listing_soup = BeautifulSoup(fetch_content(listing_url), 'html.parser')
    names, detail_urls, dates = [], [], []
    showing_soon = listing_soup.find('div', id="showing-soon")
    for item in showing_soon.find_all('div', class_="item"):
        anchors = item.find_all('a')
        list_items = item.find_all('li')
        names.append(anchors[1].text)
        detail_urls.append(anchors[1]['href'])
        dates.append(list_items[0].text)
    detail_pages = fetch_all(detail_urls)
    for name, date, html in zip(names, dates, detail_pages):
        # First <img> on a detail page is the movie poster.
        poster = BeautifulSoup(html, 'html.parser').find('img')
        print('{} {} {}'.format(name, date, poster['src']))



if __name__ == "__main__":
    # Time the whole scrape end to end.
    start = time.perf_counter()
    main()
    elapsed = time.perf_counter() - start
    print('scrape sites in {} seconds'.format(elapsed))



# scrape sites in 5.8536791 seconds