基于scrapy内置数据收集机制显示爬取效率
近段时间一直在研究如何有效地统计 scrapy 的抓取进度，了解到 scrapy 自带的数据收集接口（Stats Collection）可以实现这一需求。
下面直接贴出代码，写在 middleware（或 extension）中：
from scrapy import signals
from twisted.internet import task
def __init__(self, stats, interval=10.0):
    """Store the crawler's stats collector and the polling interval.

    Args:
        stats: the Scrapy StatsCollector instance (``crawler.stats``).
        interval: seconds between two polls of the scraped-item count.
            Defaults to 10.0, matching the original hard-coded value.
    """
    self.stats = stats
    # How often (in seconds) the LoopingCall checks the scraped count.
    # Kept as ``self.time`` so spider_opened() keeps working unchanged.
    self.time = interval
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    """Scrapy entry point: build the extension and hook it to spider signals.

    Connects spider_opened/spider_closed so the periodic stats loop is
    started and stopped together with the spider's lifecycle.
    """
    ext = cls(crawler.stats)
    # Start the polling loop when the spider opens, tear it down on close.
    crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
    crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
    return ext
def spider_opened(self):
    """Kick off the periodic stats poll when the spider starts.

    Uses twisted's LoopingCall so polling rides on the reactor that
    Scrapy is already running; ``now=True`` fires the first poll
    immediately instead of waiting one full interval.
    """
    loop = task.LoopingCall(self.collect)
    # Kept on ``self.tsk`` so spider_closed() can stop it later.
    self.tsk = loop
    loop.start(self.time, now=True)
def spider_closed(self):
    """Print the final scraped-item count and stop the polling loop."""
    final_count = self.stats.get_value('item_scraped_count')
    print(final_count)
    # Guard against a double-stop: LoopingCall.stop() raises if the
    # loop is not currently running.
    if self.tsk.running:
        self.tsk.stop()
def collect(self):
    """Periodic callback: report the current scraped-item count.

    Reads ``item_scraped_count`` from the stats collector and, for now,
    just prints it to the terminal; this is the hook point for writing
    the value to real storage instead.
    """
    count = self.stats.get_value('item_scraped_count')
    # Nothing scraped yet (None or 0) -> stay quiet.
    if not count:
        return
    print(count)