scrapy 初体验(这车很稳)

154 阅读1分钟
原文链接: blog.csdn.net

scrapy 爬虫

目标:把 gank 上的图片爬下来

// 初始化项目
scrapy startproject demo 

修改items对象

import scrapy
import os
import requests


class GankItem(scrapy.Item):
    """Scraped data for one page plus the image found on it.

    Fields:
        name: title text extracted from the page.
        imageurl: absolute URL of the image to download.
        url: URL of the page the item was scraped from; used to derive
            the local file name.
    """

    name = scrapy.Field()
    imageurl = scrapy.Field()
    url = scrapy.Field()

    def canParse(self):
        """Return True when both ``name`` and ``imageurl`` are non-empty.

        Fields that were never set (or are ``None``) count as empty, so
        this never raises ``KeyError``.
        """
        return bool(self.get('name')) and bool(self.get('imageurl'))

    def downLoad(self, imagepath):
        """Download ``imageurl`` into the directory ``imagepath``.

        The file name is built from the last three path segments of
        ``url`` joined with ``-`` (e.g. ``.../2018/10/22`` becomes
        ``2018-10-22``); it falls back to ``file`` when the URL is too
        short. The extension is taken from ``imageurl`` and defaults to
        ``jpg``. An already existing file is left untouched.

        Args:
            imagepath: Target directory; created if it does not exist.

        Raises:
            requests.RequestException: If the request fails, times out or
                returns an HTTP error status. Nothing is written in that
                case, so a later run can retry.
        """
        filename = 'file'
        segments = self.get('url', '').rstrip('/').split('/')
        if len(segments) > 3:
            filename = '-'.join(segments[-3:])

        # Drop any query string before looking for the extension, and let
        # splitext inspect only the last path segment so dots in the host
        # name are never mistaken for an extension.
        image_url = self['imageurl']
        extension = os.path.splitext(image_url.split('?', 1)[0])[1]
        suffix = extension.lstrip('.') or 'jpg'

        path = os.path.join(imagepath, filename + '.' + suffix)
        if os.path.exists(path):
            return

        print('下载文件')
        # Fetch first and write second: opening the file before a failing
        # request would leave an empty file that blocks every retry.
        # NOTE(review): requests.get blocks Scrapy's event loop while it runs.
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()

        os.makedirs(imagepath, exist_ok=True)
        with open(path, 'wb') as fp:
            fp.write(response.content)

pipelines

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# Directory that downloaded images are written to.
imags = "./images"


class GankPipeline(object):
    """Item pipeline that downloads the image referenced by each item."""

    def process_item(self, item, spider):
        """Download the item's image when the item is complete.

        Args:
            item: Scraped item; must provide ``canParse()`` and
                ``downLoad(directory)``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipeline stages and feed exports
            still receive it.
        """
        if item.canParse():
            item.downLoad(imags)

        # Scrapy requires process_item to return the item (or raise
        # DropItem); returning None would hand None to the next stage.
        return item

新建 gank spider


import scrapy

from demo.spiders.gank import GankItem

class GankSpider(scrapy.Spider):
    """Crawl gank.io pages, emitting one ``GankItem`` per page.

    Crawling starts at a fixed page and continues through the link found
    in each page's right-aligned paragraph.
    """

    name = "gank"

    allowed_domains = ["gank.io"]
    start_urls = ["https://gank.io/2018/10/22"]

    def parse(self, response):
        """Extract the page's item, then follow the link to the next page.

        Args:
            response: The downloaded page.

        Yields:
            A ``GankItem`` for this page, followed by a ``scrapy.Request``
            for the linked page when such a link exists.
        """
        item = GankItem()
        item['url'] = response.url
        # extract_first(default='') keeps a page without a title or image
        # from raising IndexError; the empty value lets the item's
        # canParse() reject it instead.
        item['name'] = response.xpath(
            '//div[@class="container content"]/h1/text()'
        ).extract_first(default='')
        item['imageurl'] = response.xpath(
            '//div[@class="container content"]/div[@class="outlink"]//p/img/@src'
        ).extract_first(default='')

        # Must be a yield: a `return item` inside a generator discards the
        # item and makes the follow-up request below unreachable.
        yield item

        # NOTE(review): presumably the link to the adjacent day's page -
        # confirm against the site markup.
        newcontent = response.xpath(
            '//div[@class="container content"]/div[@class="row"]'
            '/div[@class="six columns"]/p[@style="text-align: right"]/a/@href'
        ).extract_first()
        if newcontent:
            # urljoin resolves relative hrefs against the current page URL.
            newurl = response.urljoin(newcontent)
            print(newurl)
            yield scrapy.Request(newurl, callback=self.parse)



修改 settings.py,打开 ITEM_PIPELINES 配置

# Enable the image-download pipeline. The dotted path must start with the
# project package created by `scrapy startproject demo`; the integer sets
# the order in which pipelines run (lower values run first).
ITEM_PIPELINES = {
    'demo.pipelines.GankPipeline': 300,
}

就跑起来了

scrapy crawl gank