# 单进程单线程 — single process, single thread (serial fetching)
import requests
import time
from bs4 import BeautifulSoup
# Request headers sent with every Douban request. The User-Agent literal was
# truncated in the source (unclosed string — a syntax error); it has been
# reconstructed with a complete desktop browser UA string.
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/74.0.3729.169 Safari/537.36',
    'Referer': 'https://time.geekbang.org',
    'Connection': 'keep-alive'}
def fetch_content(url_to_fetch):
    """Download *url_to_fetch* with the shared headers; return raw bytes."""
    response = requests.get(url_to_fetch, headers=head)
    return response.content
def main():
    """Scrape Douban's upcoming-movies listing for Beijing and print, for
    each movie: name, release date, and poster image URL (fetched serially).
    """
    listing_url = "https://movie.douban.com/cinema/later/beijing/"
    listing_html = requests.get(listing_url, headers=head).content
    listing_soup = BeautifulSoup(listing_html, 'html.parser')
    container = listing_soup.find('div', id="showing-soon")
    # One (name, date, detail-url) triple per movie card.
    entries = []
    for item in container.find_all('div', class_="item"):
        anchors = item.find_all('a')
        bullets = item.find_all('li')
        # anchors[1] is the title link; bullets[0] carries the release date.
        entries.append((anchors[1].text, bullets[0].text, anchors[1]['href']))
    for name, date, detail_url in entries:
        detail_soup = BeautifulSoup(fetch_content(detail_url), 'html.parser')
        poster = detail_soup.find('img')
        print('{} {} {}'.format(name, date, poster['src']))
if __name__ == "__main__":
    # Time the whole scrape end to end.
    t0 = time.perf_counter()
    main()
    elapsed = time.perf_counter() - t0
    print('scrape sites in {} seconds'.format(elapsed))
# 单线程多协程 — single thread, multiple coroutines (asyncio + aiohttp)
import time
import asyncio
import aiohttp
from bs4 import BeautifulSoup
# Request headers sent with every Douban request. The User-Agent literal was
# truncated in the source (unclosed string — a syntax error); it has been
# reconstructed with a complete desktop browser UA string.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/74.0.3729.169 Safari/537.36',
    'Referer': 'https://time.geekbang.org/column/article/101855',
    'Connection': 'keep-alive'}
async def fetch_content(url):
    """Asynchronously download *url* and return the decoded response text.

    NOTE(review): a fresh ClientSession is opened for every call; reusing
    one session across requests would be cheaper — confirm before changing.
    """
    connector = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(headers=header,
                                     connector=connector) as session:
        async with session.get(url) as response:
            return await response.text()
async def main():
    """Scrape Douban's upcoming-movies listing for Beijing, fetch every
    detail page concurrently on the event loop, and print each movie's
    name, release date, and poster image URL.
    """
    listing_url = "https://movie.douban.com/cinema/later/beijing/"
    listing_soup = BeautifulSoup(await fetch_content(listing_url),
                                 'html.parser')
    container = listing_soup.find('div', id="showing-soon")
    names, dates, detail_urls = [], [], []
    for item in container.find_all('div', class_="item"):
        anchors = item.find_all('a')
        bullets = item.find_all('li')
        # anchors[1] is the title link; bullets[0] carries the release date.
        names.append(anchors[1].text)
        dates.append(bullets[0].text)
        detail_urls.append(anchors[1]['href'])
    # Schedule all detail-page downloads at once; gather preserves order.
    pending = [asyncio.create_task(fetch_content(u)) for u in detail_urls]
    pages = await asyncio.gather(*pending)
    for name, date, page in zip(names, dates, pages):
        poster = BeautifulSoup(page, 'html.parser').find('img')
        print('{} {} {}'.format(name, date, poster['src']))
if __name__ == "__main__":
    # Time the whole scrape end to end.
    t0 = time.perf_counter()
    asyncio.run(main())
    elapsed = time.perf_counter() - t0
    print('scrape sites in {} seconds'.format(elapsed))
# 单进程多线程 — single process, multiple threads
# MyThread类封装 — result capture via a MyThread wrapper class
import time
import requests
from threading import Thread
from bs4 import BeautifulSoup
# Request headers sent with every Douban request. The User-Agent literal was
# truncated in the source (unclosed string — a syntax error); it has been
# reconstructed with a complete desktop browser UA string.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/74.0.3729.169 Safari/537.36',
    'Referer': 'https://time.geekbang.org/column/article/101855',
    'Connection': 'keep-alive'}
def fetch_content(url_to_fetch):
    """Download *url_to_fetch* with the shared headers; return raw bytes."""
    response = requests.get(url_to_fetch, headers=header)
    return response.content
class MyThread(Thread):
    """Thread subclass that runs ``func(*args)`` and keeps its return value.

    ``threading.Thread`` discards the target's return value; this wrapper
    stores it on the instance so callers can retrieve it after ``join()``.
    """

    def __init__(self, func, args):
        super().__init__()  # zero-arg super: Python 3 idiom
        self.func = func    # callable to run on the worker thread
        self.args = args    # positional-argument tuple for func

    def run(self):
        # Executed on the worker thread by start(); stash the result.
        self.result = self.func(*self.args)

    def get_result(self):
        """Return the stored result, or None if run() has not completed.

        Narrowed from a blanket ``except Exception``: the only failure mode
        here is ``self.result`` not existing yet (thread never started,
        still running, or func raised before assigning).
        """
        try:
            return self.result
        except AttributeError:
            return None
def fetch_all(sites):
    """Fetch every URL in *sites* on its own thread.

    Returns the response bodies in the same order as *sites* (threads are
    joined in submission order).
    """
    workers = [MyThread(fetch_content, args=(site,)) for site in sites]
    for thread in workers:
        thread.start()
    collected = []
    for thread in workers:
        thread.join()
        collected.append(thread.get_result())
    return collected
def main():
    """Scrape Douban's upcoming-movies listing for Beijing, fetch every
    detail page on its own thread, and print each movie's name, release
    date, and poster image URL.
    """
    listing_url = "https://movie.douban.com/cinema/later/beijing/"
    listing_soup = BeautifulSoup(fetch_content(listing_url), 'html.parser')
    container = listing_soup.find('div', id="showing-soon")
    names, dates, detail_urls = [], [], []
    for item in container.find_all('div', class_="item"):
        anchors = item.find_all('a')
        bullets = item.find_all('li')
        # anchors[1] is the title link; bullets[0] carries the release date.
        names.append(anchors[1].text)
        dates.append(bullets[0].text)
        detail_urls.append(anchors[1]['href'])
    for name, date, page in zip(names, dates, fetch_all(detail_urls)):
        poster = BeautifulSoup(page, 'html.parser').find('img')
        print('{} {} {}'.format(name, date, poster['src']))
if __name__ == "__main__":
    # Time the whole scrape end to end.
    t0 = time.perf_counter()
    main()
    elapsed = time.perf_counter() - t0
    print('scrape sites in {} seconds'.format(elapsed))
# Futures类 — same idea via concurrent.futures (ThreadPoolExecutor)
import concurrent.futures
import time
import requests
from bs4 import BeautifulSoup
# Request headers sent with every Douban request. The User-Agent literal was
# truncated in the source (unclosed string — a syntax error); it has been
# reconstructed with a complete desktop browser UA string.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/74.0.3729.169 Safari/537.36',
    'Referer': 'https://time.geekbang.org/column/article/101855',
    'Connection': 'keep-alive'}
def fetch_content(url_to_fetch):
    """Download *url_to_fetch* with the shared headers; return raw bytes."""
    response = requests.get(url_to_fetch, headers=header)
    return response.content
def fetch_all(sites):
    """Fetch every URL in *sites* via a 5-worker thread pool.

    Returns the response bodies **in the same order as *sites***.

    Bug fixed: the original collected results with ``as_completed()``, which
    yields futures in *completion* order, so the returned pages no longer
    lined up with the ``movie_names``/``movie_dates`` lists that ``main()``
    zips against them — rows could show the wrong poster for a movie.
    ``Executor.map`` preserves input order while still running concurrently.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        return list(executor.map(fetch_content, sites))
def main():
    """Scrape Douban's upcoming-movies listing for Beijing, fetch every
    detail page through the thread pool, and print each movie's name,
    release date, and poster image URL.
    """
    listing_url = "https://movie.douban.com/cinema/later/beijing/"
    listing_soup = BeautifulSoup(fetch_content(listing_url), 'html.parser')
    container = listing_soup.find('div', id="showing-soon")
    names, dates, detail_urls = [], [], []
    for item in container.find_all('div', class_="item"):
        anchors = item.find_all('a')
        bullets = item.find_all('li')
        # anchors[1] is the title link; bullets[0] carries the release date.
        names.append(anchors[1].text)
        dates.append(bullets[0].text)
        detail_urls.append(anchors[1]['href'])
    for name, date, page in zip(names, dates, fetch_all(detail_urls)):
        poster = BeautifulSoup(page, 'html.parser').find('img')
        print('{} {} {}'.format(name, date, poster['src']))
if __name__ == "__main__":
    # Time the whole scrape end to end.
    t0 = time.perf_counter()
    main()
    elapsed = time.perf_counter() - t0
    print('scrape sites in {} seconds'.format(elapsed))
# 多进程单线程 — multiple processes, single thread each
# 全局变量共享 — results shared through a multiprocessing Manager dict
import time
import requests
import multiprocessing
from multiprocessing import Manager
from bs4 import BeautifulSoup
# Request headers sent with every Douban request. The User-Agent literal was
# truncated in the source (unclosed string — a syntax error); it has been
# reconstructed with a complete desktop browser UA string.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/74.0.3729.169 Safari/537.36',
    'Referer': 'https://time.geekbang.org/column/article/101855',
    'Connection': 'keep-alive'}
def fetch_content(url_to_fetch):
    """Download *url_to_fetch* with the shared headers; return raw bytes."""
    response = requests.get(url_to_fetch, headers=header)
    return response.content
def worker(site, return_dict):
    # Process entry point: fetch one URL and record the response body in the
    # shared Manager dict, keyed by the URL itself.
    return_dict[site] = fetch_content(site)
def fetch_all(sites):
    """Fetch every URL in *sites* in a separate process each.

    Workers write their results into a Manager dict keyed by URL; the
    bodies are returned **in the same order as *sites***.

    Bug fixed: the original returned ``result_dict.values()``, whose order
    is whatever order the worker processes happened to write in — not the
    order of *sites* — so ``main()`` printed mismatched name/date/poster
    rows. Indexing the dict by each site restores the pairing. The Manager
    is also now shut down deterministically via ``with``.
    """
    with Manager() as manager:
        result_dict = manager.dict()
        jobs = [multiprocessing.Process(target=worker,
                                        args=(site, result_dict))
                for site in sites]
        for job in jobs:
            job.start()
        for job in jobs:
            job.join()
        # Read everything out before the Manager shuts down.
        return [result_dict[site] for site in sites]
def main():
    """Scrape Douban's upcoming-movies listing for Beijing, fetch every
    detail page in its own process, and print each movie's name, release
    date, and poster image URL.
    """
    listing_url = "https://movie.douban.com/cinema/later/beijing/"
    listing_soup = BeautifulSoup(fetch_content(listing_url), 'html.parser')
    container = listing_soup.find('div', id="showing-soon")
    names, dates, detail_urls = [], [], []
    for item in container.find_all('div', class_="item"):
        anchors = item.find_all('a')
        bullets = item.find_all('li')
        # anchors[1] is the title link; bullets[0] carries the release date.
        names.append(anchors[1].text)
        dates.append(bullets[0].text)
        detail_urls.append(anchors[1]['href'])
    for name, date, page in zip(names, dates, fetch_all(detail_urls)):
        poster = BeautifulSoup(page, 'html.parser').find('img')
        print('{} {} {}'.format(name, date, poster['src']))
if __name__ == "__main__":
    # Time the whole scrape end to end.
    t0 = time.perf_counter()
    main()
    elapsed = time.perf_counter() - t0
    print('scrape sites in {} seconds'.format(elapsed))
# Futures类 — same idea via concurrent.futures (ProcessPoolExecutor)
import concurrent.futures
import time
import requests
from bs4 import BeautifulSoup
# Request headers sent with every Douban request. The User-Agent literal was
# truncated in the source (unclosed string — a syntax error); it has been
# reconstructed with a complete desktop browser UA string.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/74.0.3729.169 Safari/537.36',
    'Referer': 'https://time.geekbang.org/column/article/101855',
    'Connection': 'keep-alive'}
def fetch_content(url_to_fetch):
    """Download *url_to_fetch* with the shared headers; return raw bytes."""
    response = requests.get(url_to_fetch, headers=header)
    return response.content
def fetch_all(sites):
    """Fetch every URL in *sites* via a 5-worker process pool.

    Returns the response bodies **in the same order as *sites***.

    Bug fixed: the original collected results with ``as_completed()``, which
    yields futures in *completion* order, so the returned pages no longer
    lined up with the ``movie_names``/``movie_dates`` lists that ``main()``
    zips against them — rows could show the wrong poster for a movie.
    ``Executor.map`` preserves input order while still running in parallel.
    """
    with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
        return list(executor.map(fetch_content, sites))
def main():
    """Scrape Douban's upcoming-movies listing for Beijing, fetch every
    detail page through the process pool, and print each movie's name,
    release date, and poster image URL.
    """
    listing_url = "https://movie.douban.com/cinema/later/beijing/"
    listing_soup = BeautifulSoup(fetch_content(listing_url), 'html.parser')
    container = listing_soup.find('div', id="showing-soon")
    names, dates, detail_urls = [], [], []
    for item in container.find_all('div', class_="item"):
        anchors = item.find_all('a')
        bullets = item.find_all('li')
        # anchors[1] is the title link; bullets[0] carries the release date.
        names.append(anchors[1].text)
        dates.append(bullets[0].text)
        detail_urls.append(anchors[1]['href'])
    for name, date, page in zip(names, dates, fetch_all(detail_urls)):
        poster = BeautifulSoup(page, 'html.parser').find('img')
        print('{} {} {}'.format(name, date, poster['src']))
if __name__ == "__main__":
    # Time the whole scrape end to end.
    t0 = time.perf_counter()
    main()
    elapsed = time.perf_counter() - t0
    print('scrape sites in {} seconds'.format(elapsed))