Speeding Up a Crawler with Multithreading

  1. Basic version
# ok.py -- shared module: the URL list and the craw() download helper
import requests

# 50 list pages of cnblogs.com
urls = [
	f"https://www.cnblogs.com/#p{page}"
	for page in range(1, 50 + 1)
]

def craw(url):
	# download one page and report its size
	r = requests.get(url)
	print(url, len(r.text))

if __name__ == "__main__":
	# quick self-test; guarded so it does not run on "import ok"
	craw(urls[0])

# driver script: compare single-threaded and multi-threaded crawling
import threading

import ok

def single_thread():
	# crawl all pages one after another
	for url in ok.urls:
		ok.craw(url)


def multi_thread():
	# one thread per URL
	threads = []
	for url in ok.urls:
		threads.append(threading.Thread(target=ok.craw, args=(url,)))

	for thread in threads:
		thread.start()

	# wait for every thread to finish
	for thread in threads:
		thread.join()

if __name__ == '__main__':
	single_thread()
	multi_thread()
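
Downloading is I/O-bound: the threads spend most of their time waiting on the network (and release the GIL while they wait), so the multi-threaded version finishes many times faster. To measure the difference, the __main__ block of the driver script can be wrapped with timers; a minimal sketch, reusing single_thread() and multi_thread() defined above:

import time

if __name__ == '__main__':
	start = time.time()
	single_thread()
	print("single_thread cost:", time.time() - start, "seconds")

	start = time.time()
	multi_thread()
	print("multi_thread cost:", time.time() - start, "seconds")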

A multi-component pipeline architecture
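
The idea is to split the crawler into stages -- downloading and parsing -- that run as pools of producer and consumer threads connected by thread-safe queue.Queue objects. A minimal sketch of the pattern (the stage names and item counts here are only illustrative):

import queue
import threading

q = queue.Queue()

def producer():
	for item in range(5):
		q.put(item)            # hand work to the next stage

def consumer():
	while True:
		item = q.get()         # blocks until an item is available
		print("consumed", item)
		q.task_done()

threading.Thread(target=producer).start()
threading.Thread(target=consumer, daemon=True).start()
q.join()                       # returns once every queued item has been processed

queue.Queue does all of the locking internally, so the crawl and parse threads below never need to touch a lock themselves.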

  1. ok.py
import requests
from bs4 import BeautifulSoup

# 50 list pages of cnblogs.com
urls = [
	f"https://www.cnblogs.com/#p{page}"
	for page in range(1, 50 + 1)
]

def craw(url):
	# download one page and return its HTML
	r = requests.get(url)
	return r.text

def parse(html):
	# extract (link, title) pairs from the post list
	soup = BeautifulSoup(html, "html.parser")
	links = soup.find_all("a", class_="post-item-title")
	return [(link["href"], link.get_text()) for link in links]


if __name__ == "__main__":
	# quick self-test: crawl and parse one page
	for result in parse(craw(urls[2])):
		print(result)
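
One optional hardening, not in the original code: requests.get() has no default timeout, so a stalled connection can block a crawler thread forever. A hedged sketch of a more defensive craw(), where the timeout value and User-Agent string are arbitrary illustrative choices:

import requests

def craw(url):
	# same interface as the craw() above, with a timeout and an explicit User-Agent;
	# both values below are illustrative assumptions, not requirements
	r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
	r.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
	return r.text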
  2. The upgraded crawler
import queue
import random
import threading
import time

import ok

def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
	# producer: take a URL, download it, hand the HTML to the parsers
	while True:
		url = url_queue.get()
		html = ok.craw(url)
		html_queue.put(html)
		print(threading.current_thread().name, f"craw {url}",
			  "url_queue.size=", url_queue.qsize())
		time.sleep(random.randint(1, 2))


def do_parse(html_queue: queue.Queue, fout):
	# consumer: take HTML, parse it, append the results to the output file
	while True:
		html = html_queue.get()
		results = ok.parse(html)
		for result in results:
			fout.write(str(result) + "\n")
		print(threading.current_thread().name, f"results.size={len(results)}",
			  "html_queue.size=", html_queue.qsize())
		time.sleep(random.randint(1, 2))

if __name__ == "__main__":
	url_queue = queue.Queue()
	html_queue = queue.Queue()
	for url in ok.urls:
		url_queue.put(url)  # seed the producers' work queue

	# 3 producer threads: download pages
	for idx in range(3):
		t = threading.Thread(target=do_craw, args=(url_queue, html_queue), name=f"craw{idx}")
		t.start()

	# 2 consumer threads: parse pages and write results to 02.txt
	fout = open("02.txt", "w")
	for idx in range(2):
		t = threading.Thread(target=do_parse, args=(html_queue, fout), name=f"parse{idx}")
		t.start()
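
As written, do_craw and do_parse loop forever, so the script never exits on its own and 02.txt is never explicitly closed. One common fix is to send a sentinel ("poison pill") through each queue once the real work is enqueued; a minimal sketch under that assumption, where the STOP object and the thread lists are illustrative additions, not part of the original code:

import queue
import threading

import ok

STOP = object()  # sentinel meaning "no more work"

def do_craw(url_queue, html_queue):
	while True:
		url = url_queue.get()
		if url is STOP:
			break
		html_queue.put(ok.craw(url))

def do_parse(html_queue, fout):
	while True:
		html = html_queue.get()
		if html is STOP:
			break
		for result in ok.parse(html):
			fout.write(str(result) + "\n")

if __name__ == "__main__":
	url_queue, html_queue = queue.Queue(), queue.Queue()
	for url in ok.urls:
		url_queue.put(url)

	crawlers = [threading.Thread(target=do_craw, args=(url_queue, html_queue)) for _ in range(3)]
	with open("02.txt", "w") as fout:
		parsers = [threading.Thread(target=do_parse, args=(html_queue, fout)) for _ in range(2)]
		for t in crawlers + parsers:
			t.start()

		for _ in crawlers:
			url_queue.put(STOP)   # one sentinel per crawler, queued after every real URL
		for t in crawlers:
			t.join()              # every page's HTML is now in html_queue

		for _ in parsers:
			html_queue.put(STOP)  # one sentinel per parser, queued after all the HTML
		for t in parsers:
			t.join()              # safe to close 02.txt once the parsers are done

A simpler alternative, if losing in-flight work on exit is acceptable, is to start the worker threads with daemon=True so they die when the main thread finishes.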