Scrapy Development Notes

"""
假如有两个下载中间件的话,执行流程就是请求1--》请求2--》相应2--》响应1
假如发生异常了,可以在start_request后跟一个关键字参数,errback,而且中间件的执行顺序是请求1--》错误2--》错误1
错误回调会处理所有异常
中间件的请求都返回Response,如果不返回None,就不会走下一个中间件的请求,但是会走下一个中间件的响应
中间件的求情如果返回Request就会请求两次
请求发生异常中间件会捕获,响应发生异常爬虫errback会捕获
异常处理里面返回一个响应,就会从响应2开始走,然后走响应1,最后给callBack

"""


If process_request issues another Request, an extra request is made, effectively a duplicate.

If process_request returns a Response, the later middlewares' process_request methods are skipped, but process_response still runs in order 3, 2, 1.

If the request raises an exception, the middlewares' process_exception methods run in order 3, 2, 1, and then the errback.

If an exception occurs while handling the response, the errback is called.

If process_exception returns a Request, the request is retried; if it returns a Response, the response chain runs instead.

errback: catches all exceptions.

callback: receives all normal responses.

Middleware execution order

During the request phase, the middleware with the smaller number runs first. If a middleware errors during the request phase, the process_exception methods run first, starting from the highest-numbered middleware; if none of them handles the failure, the errback passed in start_requests catches it. If everything goes smoothly, the flow is: request 1 -> request 2 -> response 2 -> response 1.
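
To make the order concrete, here is a minimal sketch of two downloader middlewares plus an errback; the class names, spider, and priority numbers are illustrative, not from a real project:

# middlewares.py (sketch)
import scrapy


class MiddlewareOne:
    def process_request(self, request, spider):
        spider.logger.info("request 1")
        return None  # None: continue to the next middleware

    def process_response(self, request, response, spider):
        spider.logger.info("response 1")
        return response

    def process_exception(self, request, exception, spider):
        spider.logger.info("exception 1")
        return None  # None: keep propagating the exception


class MiddlewareTwo:
    def process_request(self, request, spider):
        spider.logger.info("request 2")
        return None

    def process_response(self, request, response, spider):
        spider.logger.info("response 2")
        return response

    def process_exception(self, request, exception, spider):
        spider.logger.info("exception 2")
        return None


class DemoSpider(scrapy.Spider):
    name = "demo"

    def start_requests(self):
        # errback catches whatever the middlewares leave unhandled
        yield scrapy.Request("http://example.com", callback=self.parse,
                             errback=self.handle_error)

    def parse(self, response):
        pass

    def handle_error(self, failure):
        self.logger.error(repr(failure))


# settings.py: with these priorities, MiddlewareOne is "1" and MiddlewareTwo is "2",
# so process_request runs 1 then 2, and process_response runs 2 then 1
# DOWNLOADER_MIDDLEWARES = {
#     "myproject.middlewares.MiddlewareOne": 100,
#     "myproject.middlewares.MiddlewareTwo": 200,
# }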

Placing a proxy IP in a middleware

We attach the proxy IP to the request in process_request; once an exception occurs and process_exception catches it, we swap in a new proxy and reissue the request, as in the sketch below.

github.com/jhao104/pro…
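
A minimal sketch of such a middleware; get_proxy() is a hypothetical helper that pulls an address from a proxy-pool service such as the one linked above (the endpoint and JSON shape are assumptions, adjust to your deployment):

import requests


def get_proxy():
    # hypothetical proxy-pool endpoint returning {"proxy": "ip:port", ...}
    return requests.get("http://127.0.0.1:5010/get/").json().get("proxy")


class RandomProxyMiddleware:
    def process_request(self, request, spider):
        request.meta["proxy"] = "http://" + get_proxy()

    def process_exception(self, request, exception, spider):
        # the proxy failed: swap in a fresh one and return the request,
        # which tells Scrapy to reschedule it for download
        request.meta["proxy"] = "http://" + get_proxy()
        return request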

CSS selectors

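A few common response.css() patterns (the same ones used in the jobbole spider later in this post; extract_first/extract are the classic APIs, newer Scrapy also offers get()/getall()):

title = response.css(".entry-header h1::text").extract_first("")      # text of a node
href = response.css("a::attr(href)").extract_first("")                # an attribute value
img_src = response.css("img::attr(src)").extract_first("")
tags = response.css("p.entry-meta-hide-on-mobile a::text").extract()  # all matches, as a list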

Automatic image downloading in Scrapy

# In settings.py
# In ITEM_PIPELINES, the smaller the number, the earlier the pipeline runs
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}

# Tell the pipeline which item field holds the image URLs
IMAGES_URLS_FIELD = "front_image_url"
# Get the project path
import os
project_dir = os.path.abspath(os.path.dirname(__file__))
# Set the directory where downloaded images are stored
IMAGES_STORE = os.path.join(project_dir, "image")


# The item must define a front_image_url field, e.g.:
class ArticleItem(scrapy.Item):
    front_image_url = scrapy.Field()

# In pipelines.py
from scrapy.pipelines.images import ImagesPipeline

class ArticleImagePipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        if "front_image_url" in item:
            # results is a list of (success, file_info) tuples
            for ok, value in results:
                if ok:
                    item["front_image_path"] = value["path"]
        return item
         
         
# Name downloaded images with an MD5 hash, which gives a unique fixed-length string
import hashlib

def get_md5(url):
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()


article_item["url_object_id"] = get_md5(response.url)
         
         
 
Finally, since there are two pipelines in total, be sure to give the image-download pipeline the smallest number so that it runs first, for example:
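
A sketch, assuming the ArticleSpider project layout used later in this post (the module path is an assumption):

# settings.py: lower number = runs earlier
ITEM_PIPELINES = {
    'ArticleSpider.pipelines.ArticleImagePipeline': 1,  # fill front_image_path first
    'ArticleSpider.pipelines.JsonExporterPipeline': 2,  # then export the finished item
}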
         

Exporting items as JSON from a pipeline

# pipelines.py
from scrapy.exporters import JsonItemExporter

class JsonExporterPipeline(object):
    # Use Scrapy's JsonItemExporter to write items out as a JSON file
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

When designing the storage table, the MD5-hashed field can serve as the primary key.
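
For the jobbole_article item below, such a table could look like this (a sketch only; the column types are assumptions, and the insert statement shown later would then also need to supply url_object_id):

create_table_sql = """
    CREATE TABLE jobbole_article (
        url_object_id VARCHAR(32) NOT NULL,  -- md5 hex digest of the article url
        title VARCHAR(200) NOT NULL,
        url VARCHAR(300) NOT NULL,
        create_date DATE,
        fav_nums INT DEFAULT 0,
        front_image_url VARCHAR(300),
        front_image_path VARCHAR(200),
        praise_nums INT DEFAULT 0,
        comment_nums INT DEFAULT 0,
        tags VARCHAR(200),
        content LONGTEXT,
        PRIMARY KEY (url_object_id)
    )
"""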

Twisted, an asynchronous I/O networking framework

import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi


class MysqlTwistedPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host = settings["MYSQL_HOST"],
            db = settings["MYSQL_DBNAME"],
            user = settings["MYSQL_USER"],
            passwd = settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)

        return cls(dbpool)

    def process_item(self, item, spider):
        # Use Twisted to run the MySQL insert asynchronously
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle exceptions
        return item

    def handle_error(self, failure, item, spider):
        # Handle exceptions raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # Run the actual insert; each item type builds its own SQL statement
        insert_sql, params = item.get_insert_sql()
        print(insert_sql, params)
        cursor.execute(insert_sql, params)
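
The from_settings classmethod above reads these keys from settings.py (the values here are placeholders):

# settings.py
MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "article_spider"
MYSQL_USER = "root"
MYSQL_PASSWORD = "root"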

scrapy-djangoitem

It lets us work with the database the same way we use Django models.

github.com/scrapy-plug…
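
A minimal sketch; Article here is a hypothetical Django model, and the import path depends on your Django app:

from scrapy_djangoitem import DjangoItem
from myapp.models import Article  # hypothetical Django app and model


class ArticleItem(DjangoItem):
    django_model = Article

In a pipeline, calling item.save() then persists the row through the Django ORM.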

Scrapy Item notes

# jobbole.py
import re
import scrapy
import datetime
from scrapy.http import Request
from urllib import parse
from scrapy.loader import ItemLoader
import time

from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader

from ArticleSpider.utils.common import get_md5


class JobboleSpider(scrapy.Spider):
    name = "jobbole"
    allowed_domains = ["python.jobbole.com"]
    start_urls = ['http://python.jobbole.com/all-posts/']

    def parse(self, response):
        """
        1. 获取文章列表页中的文章url并交给scrapy下载后并进行解析
        2. 获取下一页的url并交给scrapy进行下载, 下载完成后交给parse
        """

        #解析列表页中的所有文章url并交给scrapy下载后并进行解析
        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        for post_node in post_nodes:
            image_url = post_node.css("img::attr(src)").extract_first("")
            post_url = post_node.css("::attr(href)").extract_first("")
            yield Request(url=parse.urljoin(response.url, post_url), meta={"front_image_url":image_url}, callback=self.parse_detail)

        # Extract the next-page link and hand it to Scrapy to download
        next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        front_image_url = response.meta.get("front_image_url", "")  # article cover image
        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        article_item = item_loader.load_item()


        yield article_item
        

# items.py
import re
import datetime

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join
def add_jobbole(value):
    return value+"-bobby"


def date_convert(value):
    try:
        create_date = datetime.datetime.strptime(value, "%Y/%m/%d").date()
    except Exception as e:
        create_date = datetime.datetime.now().date()

    return create_date


def get_nums(value):
    match_re = re.match(r".*?(\d+).*", value)
    if match_re:
        nums = int(match_re.group(1))
    else:
        nums = 0

    return nums

def return_value(value):
    return value


def remove_comment_tags(value):
    # Drop the comment-count entry that gets extracted along with the tags
    if "评论" in value:
        return ""
    else:
        return value

class ArticleItemLoader(ItemLoader):
    # Custom ItemLoader: take only the first extracted value by default
    default_output_processor = TakeFirst()


class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert),
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field(
        # override the default TakeFirst so the field stays a list,
        # which is what the ImagesPipeline expects
        output_processor=MapCompose(return_value)
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
    content = scrapy.Field()

    def get_insert_sql(self):
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums, front_image_url, front_image_path,
            praise_nums, comment_nums, tags, content)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE fav_nums=VALUES(fav_nums)
        """

        front_image_url = ""
        # content = remove_tags(self["content"])

        if self["front_image_url"]:
            front_image_url = self["front_image_url"][0]
        params = (self["title"], self["url"], self["create_date"], self["fav_nums"],
                  front_image_url, self["front_image_path"], self["praise_nums"], self["comment_nums"],
                  self["tags"], self["content"])
        return insert_sql, params

Crawling 美桌 (win4000.com) wallpapers with requests_html

import hashlib
import os
import time
import logging
from concurrent.futures.thread import ThreadPoolExecutor

import requests
import requests_html
from fake_useragent import UserAgent

"""
1 进入rili 的url
2 for element in elements:
     2.1获取图标的url,进入之后获取大图
     2.2 解析图片,每一组图片都是一个dict = {“相册名字”:[url1.jpg,url2.jpg,...]}
     2.3 列表中存储图片url[dict,dict,...]
3 循环之后,获取next元素,有的话进入,再次执行2
4 下载图片使用字典的key作为文件夹的名字,md5作为图片的名字

"""


class Spider:

    def __init__(self):
        self.ua = UserAgent()
        self.session = requests_html.HTMLSession()
        self.headers = {
            'Cookie': 'XSRF-TOKEN=eyJpdiI6IldLODA2ekVEb0pXV2JBVHF0UjBteEE9PSIsInZhbHVlIjoiRkhrRm0rTkRXSVNJbXViNlpqMHdHZ0lEWkJcL2RYdFNPUHE4dEE0dytCNm8xNGI5WTk1ckpSbWVlV2FzM3hqWU9GK013V1hpTGpKbklSK0diUGJkczlcLzU3aUd2Y2NJdEhta1NQVTQ1NTZidTlGK2NYU3dlMGZTeUhpTDg4WFg0TSIsIm1hYyI6ImViNjlhYWY2ZDQ0MTRjMGZlNTUxZGEwZjFlYTczYWNiZjgyYzUxMTliYmUzNmZmYjU4Zjc0OWE4OGIxNzNkZjkifQ%3D%3D; win4000_session=eyJpdiI6IlMya3J1MERXM1A3VVg5ejk0djNsYkE9PSIsInZhbHVlIjoiQnhxZkZ4ZVFrOFUybVNlUDFRcHFla2thOFVFdkFPTHZlT1B6Z2FNbWtUWks0SkZpVVVmbm1MMXh2ZzMzRTZ2bDdTcjhUNm9nc1VmbVBVaHNcL2c2OHhSK0hYM3BBaVpESFwvaWs4d3A5aWFQWnNndGVKS1pjVmlkWU9reFEzSjhwUiIsIm1hYyI6ImYzYjU5YTBmNWNiMjk2MjVmM2Q4ZTc3NmJiNmU3MjllZjIxN2EzZjkzY2JkMDhkYjljYWQ0Y2FlMGEyYWQ0ZmUifQ%3D%3D; security_session_verify=96061df658b4b77469e02c406543b820',
            'Connection': 'keep-alive',
            'Referer': 'http://www.win4000.com/wallpaper.html',
            'Host': 'www.win4000.com'
        }
        self.base_url = 'http://www.win4000.com/wallpaper_0_0_0_1.html'

    def parse(self):
        logging.info("parsing the category list")
        # dict.update() returns None, so mutate self.headers and pass it directly
        self.headers['User-Agent'] = self.ua.random
        response = self.session.get(url=self.base_url, headers=self.headers)
        with ThreadPoolExecutor(10) as executor:

            for btn in response.html.find('body > div.main > div > div.product_query > div:nth-child(1) > div > div>a')[1:]:
                print(btn)
                print(btn.text)
                executor.submit(self.parse1, btn)

    def parse1(self, btn):
        base_url = btn.attrs['href']
        folder_name = btn.text
        self.headers['User-Agent'] = self.ua.random
        response = self.session.get(url=base_url, headers=self.headers)
        last_page = response.html.find('.pages a')[-2].text  # number of the last page
        for i in range(1, int(last_page) + 1):
            # page urls follow the pattern xxx.html -> xxx_1.html, xxx_2.html, ...
            page_url = base_url.replace('.html', f'_{i}.html')
            self.headers['User-Agent'] = self.ua.random
            response = self.session.get(url=page_url, headers=self.headers)
            elements = response.html.find('.Left_list_cont .tab_tj .tab_box .clearfix li a')

            # Visit every album on the current page
            for element in elements:
                self.into_big_img(element, folder_name)

    def into_big_img(self, element, folder_name):
        """
        Follow a thumbnail into its album and collect the full-size image urls into a dict
        :param element: thumbnail link element
        :param folder_name: category folder to store the album under
        """
        logging.info("entering an album")
        sub_url = element.attrs['href']
        self.headers['User-Agent'] = self.ua.random
        response = self.session.get(url=sub_url, headers=self.headers)
        max_page_num = int(response.html.find('.ptitle em', first=True).text)  # number of images in the album
        # Build the album title
        title = response.html.find('.ptitle h1', first=True).text \
                + '(' + response.html.find('.ptitle span', first=True).text \
                + '-' + str(max_page_num) + ')'

        img_dict = {title: []}
        for page in range(1, max_page_num + 1):  # fetch each full-size image page
            url = sub_url.split('.html')[0] + '_' + str(page) + ".html"
            self.headers['User-Agent'] = self.ua.random
            response = self.session.get(url=url, headers=self.headers)
            img_url = response.html.find('.pic-meinv a img')[0].attrs['src']
            img_dict[title].append(img_url)
        self.download_img(img_dict, folder_name)

    # Download the images, one directory per album
    def download_img(self, img_all_dict, folder_name):
        logging.info("downloading")
        for k, v in img_all_dict.items():
            path = f"E:/美桌图片/{folder_name}" + "/" + k
            if not os.path.exists(path):
                os.makedirs(path)
            for img in v:
                content = requests.get(img).content
                img_name = str(self.hash_md5())
                # media files must be written in binary mode
                with open(f'{path}/{img_name}.jpg', 'wb') as f:
                    f.write(content)

    def hash_md5(self):
        m = hashlib.md5()  # create the hash object
        mad_str = str(time.time())  # hash the current timestamp to get a practically unique name
        m.update(mad_str.encode('utf-8'))
        img_name = m.hexdigest()
        return img_name


if __name__ == '__main__':
    spider = Spider()
    spider.parse()
    # spider.into_big_img()
    # spider.download_img()