"""
假如有两个下载中间件的话,执行流程就是请求1--》请求2--》相应2--》响应1
假如发生异常了,可以在start_request后跟一个关键字参数,errback,而且中间件的执行顺序是请求1--》错误2--》错误1
错误回调会处理所有异常
中间件的请求都返回Response,如果不返回None,就不会走下一个中间件的请求,但是会走下一个中间件的响应
中间件的求情如果返回Request就会请求两次
请求发生异常中间件会捕获,响应发生异常爬虫errback会捕获
异常处理里面返回一个响应,就会从响应2开始走,然后走响应1,最后给callBack
"""
If process_request returns another Request, an additional request is scheduled, effectively repeating the request.
If process_request returns a Response, the later middlewares' process_request methods are skipped, but process_response still runs through 3, 2, 1.
If an exception occurs during the request, it runs through the middlewares' process_exception 3, 2, 1 and then the errback.
If an exception occurs while handling the response, it goes to the errback.
If process_exception returns a Request, the request is retried; if it returns a Response, response processing runs again.
errback: catches all exceptions
callback: receives all successful responses
Middleware execution order
During the request phase, the smaller the number, the earlier that middleware's process_request runs. If a middleware raises an error during the request phase, the exception first runs through the middlewares' process_exception (largest number first); if nothing handles it, it ends up in the errback passed in start_requests. If everything goes well, the flow is request 1 -> request 2 -> response 2 -> response 1.
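A minimal sketch of wiring an errback onto the initial request (spider name and URL are placeholders):
# spider sketch -- name and url are placeholders
import scrapy

class ErrbackDemoSpider(scrapy.Spider):
    name = "errback_demo"

    def start_requests(self):
        yield scrapy.Request(
            "http://example.com/",
            callback=self.parse,        # called for successful responses
            errback=self.handle_error,  # called for unhandled download errors
        )

    def parse(self, response):
        self.logger.info("got %s", response.url)

    def handle_error(self, failure):
        # failure is a twisted Failure wrapping the original exception
        self.logger.error(repr(failure))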
Putting proxy IPs in a downloader middleware
Set the proxy IP on the request in process_request; once an exception is caught in process_exception, switch the proxy and re-issue the request.
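A minimal sketch of that idea; the proxy pool and the get_random_proxy() helper are hypothetical:
# middlewares.py -- sketch; the proxy pool is a placeholder
import random

PROXIES = ["http://127.0.0.1:8888", "http://127.0.0.1:9999"]

def get_random_proxy():
    return random.choice(PROXIES)

class RandomProxyMiddleware:
    def process_request(self, request, spider):
        # attach a proxy before the request is downloaded
        request.meta["proxy"] = get_random_proxy()

    def process_exception(self, request, exception, spider):
        # the proxy failed: pick another one and reschedule the same request
        retry_req = request.replace(dont_filter=True)  # keep the dupefilter from dropping the retry
        retry_req.meta["proxy"] = get_random_proxy()
        return retry_req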
CSS selectors
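Two one-liners showing the ::text / ::attr syntax the spider below relies on (selectors are illustrative):
# illustrative only -- run inside a spider callback
title = response.css(".entry-header h1::text").extract_first("")
next_href = response.css(".next.page-numbers::attr(href)").extract_first("")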
Having scrapy download images automatically
# in settings.py
import os

# in ITEM_PIPELINES, the smaller the number, the earlier the item passes through that pipeline
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
# tell the images pipeline which item field holds the image URLs
IMAGES_URLS_FIELD = "front_image_url"
# project path
project_dir = os.path.abspath(os.path.dirname(__file__))
# where the downloaded images are stored
IMAGES_STORE = os.path.join(project_dir, "image")
# the item must have a front_image_url field, e.g.:
class ArticleItem(scrapy.Item):
    front_image_url = scrapy.Field()

# in pipelines.py
from scrapy.pipelines.images import ImagesPipeline

class ArticleImagePipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        # results is a list of (ok, value) tuples; value["path"] is the saved file path
        if "front_image_url" in item:
            for ok, value in results:
                image_file_path = value["path"]
            item["front_image_path"] = image_file_path
        return item
# it is best to name downloaded images with an md5 hash of the url, which gives a unique string
import hashlib

def get_md5(url):
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()

article_item["url_object_id"] = get_md5(response.url)
# Finally, with both pipelines enabled, make sure the image-downloading pipeline gets the smallest number in ITEM_PIPELINES so items pass through it first.
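A sketch of what that registration might look like (the module paths are assumptions based on the classes in these notes):
# settings.py -- sketch; module paths are assumptions
ITEM_PIPELINES = {
    'ArticleSpider.pipelines.ArticleImagePipeline': 1,   # runs first, fills in front_image_path
    'ArticleSpider.pipelines.JsonExporterPipleline': 2,  # runs afterwards
}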
Exporting items to a json file from a pipeline
# pipelines.py
from scrapy.exporters import JsonItemExporter

class JsonExporterPipleline(object):
    # use the JsonItemExporter provided by scrapy to export items to a json file
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
When designing the storage table, the md5-hashed url field can be used as the primary key.
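For example, a table along these lines (column types are assumptions matching the item fields used below; note that with url_object_id as the primary key, the INSERT further below would also need to supply that column):
create_table_sql = """
CREATE TABLE jobbole_article (
    url_object_id VARCHAR(32) NOT NULL PRIMARY KEY,  -- md5 of the url
    title VARCHAR(200) NOT NULL,
    url VARCHAR(300) NOT NULL,
    create_date DATE,
    fav_nums INT DEFAULT 0,
    front_image_url VARCHAR(300),
    front_image_path VARCHAR(200),
    praise_nums INT DEFAULT 0,
    comment_nums INT DEFAULT 0,
    tags VARCHAR(200),
    content LONGTEXT
)
"""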
Asynchronous MySQL inserts with the Twisted asynchronous I/O framework
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi

class MysqlTwistedPipline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # hand the insert to twisted so it runs asynchronously
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle exceptions
        return item

    def handle_error(self, failure, item, spider):
        # handle exceptions raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # run the actual insert:
        # build a different sql statement per item type and insert it into mysql
        insert_sql, params = item.get_insert_sql()
        print(insert_sql, params)
        cursor.execute(insert_sql, params)
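The settings this pipeline reads, plus its registration, might look like this (values are placeholders):
# settings.py -- placeholder values
MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "article_spider"
MYSQL_USER = "root"
MYSQL_PASSWORD = "root"

ITEM_PIPELINES = {
    'ArticleSpider.pipelines.MysqlTwistedPipline': 3,
}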
scrapy-djangoitem
lets us work with the database the same way we use Django models
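A minimal sketch of scrapy-djangoitem usage, assuming an existing Django model named Article in an app called django_app (both names are assumptions):
# items.py -- sketch; the Article model and the django_app package are assumptions
from scrapy_djangoitem import DjangoItem
from django_app.models import Article

class ArticleItem(DjangoItem):
    django_model = Article  # the item's fields come from the Django model

# later, in a pipeline, item.save() persists the item through the Django ORM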
scrapy item notes
# jobbole.py
import re
import scrapy
import datetime
from scrapy.http import Request
from urllib import parse
from scrapy.loader import ItemLoader
import time
from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader
from ArticleSpider.utils.common import get_md5

class JobboleSpider(scrapy.Spider):
    name = "jobbole"
    allowed_domains = ["python.jobbole.com"]
    start_urls = ['http://python.jobbole.com/all-posts/']

    def parse(self, response):
        """
        1. extract the article urls from the list page and hand them to scrapy to download and parse
        2. extract the next-page url and hand it to scrapy to download; the downloaded page is passed back to parse
        """
        # parse every article url on the list page and hand it to scrapy to download, then parse the detail page
        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        for post_node in post_nodes:
            image_url = post_node.css("img::attr(src)").extract_first("")
            post_url = post_node.css("::attr(href)").extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url), meta={"front_image_url": image_url}, callback=self.parse_detail)

        # extract the next page and hand it to scrapy to download
        next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        article_item = JobBoleArticleItem()
        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()
        yield article_item
# items.py
import re
import datetime
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join  # itemloaders.processors in newer Scrapy

def add_jobbole(value):
    return value + "-bobby"

def date_convert(value):
    try:
        create_date = datetime.datetime.strptime(value, "%Y/%m/%d").date()
    except Exception as e:
        create_date = datetime.datetime.now().date()
    return create_date

def get_nums(value):
    match_re = re.match(r".*?(\d+).*", value)
    if match_re:
        nums = int(match_re.group(1))
    else:
        nums = 0
    return nums

def return_value(value):
    return value

def remove_comment_tags(value):
    # drop the "评论" (comment count) entry that gets extracted together with the tags
    if "评论" in value:
        return ""
    else:
        return value

class ArticleItemLoader(ItemLoader):
    # custom ItemLoader: take the first extracted value by default
    default_output_processor = TakeFirst()

class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert),
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field(
        # keep the value as a list instead of applying TakeFirst; the images pipeline expects a list
        output_processor=MapCompose(return_value)
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
    content = scrapy.Field()

    def get_insert_sql(self):
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums, front_image_url, front_image_path,
            praise_nums, comment_nums, tags, content)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE fav_nums=VALUES(fav_nums)
        """
        front_image_url = ""
        # content = remove_tags(self["content"])
        if self["front_image_url"]:
            front_image_url = self["front_image_url"][0]
        params = (self["title"], self["url"], self["create_date"], self["fav_nums"],
                  front_image_url, self["front_image_path"], self["praise_nums"], self["comment_nums"],
                  self["tags"], self["content"])
        return insert_sql, params
Scraping 美桌 (www.win4000.com) wallpapers with requests_html
import hashlib
import os
import time
import requests
import requests_html
import re
import pandas
import numpy
import scrapy
from fake_useragent import UserAgent
import logging
import threading
from concurrent.futures.thread import ThreadPoolExecutor
"""
1 进入rili 的url
2 for element in elements:
2.1获取图标的url,进入之后获取大图
2.2 解析图片,每一组图片都是一个dict = {“相册名字”:[url1.jpg,url2.jpg,...]}
2.3 列表中存储图片url[dict,dict,...]
3 循环之后,获取next元素,有的话进入,再次执行2
4 下载图片使用字典的key作为文件夹的名字,md5作为图片的名字
"""
class Spider:
def __init__(self):
self.ua = UserAgent()
self.session = requests_html.HTMLSession()
self.headers = {
'Cookie': 'XSRF-TOKEN=eyJpdiI6IldLODA2ekVEb0pXV2JBVHF0UjBteEE9PSIsInZhbHVlIjoiRkhrRm0rTkRXSVNJbXViNlpqMHdHZ0lEWkJcL2RYdFNPUHE4dEE0dytCNm8xNGI5WTk1ckpSbWVlV2FzM3hqWU9GK013V1hpTGpKbklSK0diUGJkczlcLzU3aUd2Y2NJdEhta1NQVTQ1NTZidTlGK2NYU3dlMGZTeUhpTDg4WFg0TSIsIm1hYyI6ImViNjlhYWY2ZDQ0MTRjMGZlNTUxZGEwZjFlYTczYWNiZjgyYzUxMTliYmUzNmZmYjU4Zjc0OWE4OGIxNzNkZjkifQ%3D%3D; win4000_session=eyJpdiI6IlMya3J1MERXM1A3VVg5ejk0djNsYkE9PSIsInZhbHVlIjoiQnhxZkZ4ZVFrOFUybVNlUDFRcHFla2thOFVFdkFPTHZlT1B6Z2FNbWtUWks0SkZpVVVmbm1MMXh2ZzMzRTZ2bDdTcjhUNm9nc1VmbVBVaHNcL2c2OHhSK0hYM3BBaVpESFwvaWs4d3A5aWFQWnNndGVKS1pjVmlkWU9reFEzSjhwUiIsIm1hYyI6ImYzYjU5YTBmNWNiMjk2MjVmM2Q4ZTc3NmJiNmU3MjllZjIxN2EzZjkzY2JkMDhkYjljYWQ0Y2FlMGEyYWQ0ZmUifQ%3D%3D; security_session_verify=96061df658b4b77469e02c406543b820',
'Connection': 'keep-alive',
'Referer': 'http://www.win4000.com/wallpaper.html',
'Host': 'www.win4000.com'
}
self.base_url = 'http://www.win4000.com/wallpaper_0_0_0_1.html'
    def parse(self):
        logging.info("parsing the category list")
        # dict.update() returns None, so update self.headers in place and pass it directly
        self.headers.update({'User-Agent': self.ua.random})
        response = self.session.get(headers=self.headers, url=self.base_url)
        with ThreadPoolExecutor(10) as executor:
            for btn in response.html.find('body > div.main > div > div.product_query > div:nth-child(1) > div > div>a')[1:]:
                print(btn)
                print(btn.text)
                executor.submit(self.parse1, btn)
    def parse1(self, btn):
        base_url = btn.attrs['href']
        folder_name = btn.text
        self.headers.update({'User-Agent': self.ua.random})
        response = self.session.get(headers=self.headers, url=base_url)
        last_page = response.html.find('.pages a')[-2].text  # number of the last page
        for i in range(1, int(last_page) + 1):
            # build the url of page i from this category's url (not from the shared self.base_url)
            page_url = base_url.replace('.html', f'_{i}.html')
            self.headers.update({'User-Agent': self.ua.random})
            response = self.session.get(headers=self.headers, url=page_url)
            elements = response.html.find('.Left_list_cont .tab_tj .tab_box .clearfix li a')
            # loop over every thumbnail on the current page
            num = 0
            for element in elements:
                num += 1
                self.into_big_img(element, folder_name)
    def into_big_img(self, element, folder_name):
        """
        follow a thumbnail into the album and collect its full-size image urls in a dict
        :param element: the thumbnail link element
        :param folder_name: the category folder the album belongs to
        :return:
        """
        logging.info("entering an album")
        sub_url = element.attrs['href']
        self.headers.update({'User-Agent': self.ua.random})
        response = self.session.get(headers=self.headers, url=sub_url)
        max_page_num = int(response.html.find('.ptitle em', first=True).text)  # number of images in the album
        # build the album title
        title = response.html.find('.ptitle h1', first=True).text \
            + '(' + response.html.find('.ptitle span', first=True).text \
            + '-' + str(max_page_num) + ')'
        img_dict = {title: []}
        for page in range(1, max_page_num + 1):  # fetch every full-size image page
            url = sub_url.split('.html')[0] + '_' + str(page) + ".html"
            self.headers.update({'User-Agent': self.ua.random})
            response = self.session.get(headers=self.headers, url=url)
            img_url = response.html.find('.pic-meinv a img')[0].attrs['src']
            img_dict[title].append(img_url)
        self.download_img(img_dict, folder_name)
    # download, storing each album in its own folder
    def download_img(self, img_all_dict, folder_name):
        logging.info("downloading")
        for k, v in img_all_dict.items():
            path = f"E:/美桌图片/{folder_name}" + "/" + k
            folder = os.path.exists(path)
            print(folder)
            if not folder:
                os.makedirs(path)
            for img in v:
                content = requests.get(img).content
                img_name = str(self.hash_md5())
                # media files have to be written in binary mode
                with open(f'{path}/{img_name}.jpg', 'wb') as f:
                    f.write(content)

    def hash_md5(self):
        m = hashlib.md5()  # hashing object
        mad_str = str(time.time())
        m.update(mad_str.encode('utf-8'))
        img_name = m.hexdigest()
        return img_name
if __name__ == '__main__':
spider = Spider()
spider.parse()
# spider.into_big_img()
# spider.download_img()