Scrapy Crawler Framework 3: Passing Data with meta


Data Extraction

Earlier, while learning XPath, I covered get() and getall(): get() returns the first matching result, and getall() returns all matching results. Today I picked up two more methods that do the same thing:

title = response.xpath('//div[@class="hd"]/a/span[1]/text()').get()    # first matching result
print(title)
title1 = response.xpath('//div[@class="hd"]/a/span[1]/text()').extract_first()
print(title1)
titles = response.xpath('//div[@class="hd"]/a/span[1]/text()').getall() # all matching results
print(titles)
titles1 = response.xpath('//div[@class="hd"]/a/span[1]/text()').extract()
print(titles1)

Output: get() and extract_first() each print the first matching title as a string, while getall() and extract() each print the full list of titles.

get() and getall() are newer methods, introduced to replace extract_first() and extract(). The results are identical; the new names simply make the code clearer, so prefer them.
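One practical difference worth knowing: when the XPath matches nothing, get() quietly returns None instead of raising, and both get() and extract_first() accept a fallback via the default parameter. A minimal sketch (the non-matching XPath is just an illustration):

missing = response.xpath('//div[@class="no-such-class"]/text()').get()
print(missing)  # None

missing = response.xpath('//div[@class="no-such-class"]/text()').get(default='N/A')
print(missing)  # 'N/A'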

Data Storage

The following demonstrates several different storage approaches. Whichever one you choose, after writing the pipeline you still need to register it as a storage channel in settings.py:

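That channel is the ITEM_PIPELINES setting: a dict mapping each pipeline's import path to a priority (lower numbers run first). A minimal sketch, assuming the project module is named douban_top and using the pipeline classes defined below:

ITEM_PIPELINES = {
    'douban_top.pipelines.DoubanTopPipeline': 300,
    'douban_top.pipelines.CsvDoubanTopPipeline': 310,
    'douban_top.pipelines.ExcelCsvDoubanTopPipeline': 320,
    'douban_top.pipelines.MysqlDoubanTopPipeline': 330,
}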

1. txt storage

from itemadapter import ItemAdapter

class DoubanTopPipeline:  # class declaration and import added; the original snippet omitted them (the class name is assumed)
    # def __init__(self):  # equivalent to open_spider below
    #     self.file = open('douban_movie.txt', 'w+', encoding='utf-8')
    def open_spider(self, spider):  # runs exactly once when the spider starts; typically used to open files or database connections
        self.file = open('douban_movie.txt', 'w+', encoding='utf-8')

    def process_item(self, item, spider):
        # self.file.write(str(item) + '\n')  # option 1: dump the whole item as a string
        adapter = ItemAdapter(item)  # wrap the item in an ItemAdapter
        title = adapter.get('title')
        rating = adapter.get('rating')
        self.file.write(f'电影名称:{title}------电影评分:{rating}\n')
        print('item', item)
        # spider metadata is also available here
        # print(spider.name, spider.allowed_domains)
        return item

    def close_spider(self, spider):
        self.file.close()
        print('爬虫结束关闭文件')
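open_spider and close_spider are hooks that Scrapy calls once at spider start and finish, so they are the natural place to acquire and release resources; as the commented-out lines show, __init__ would also work for opening the file here.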

2. csv storage

import csv
from itemadapter import ItemAdapter

class CsvDoubanTopPipeline:
    def __init__(self):
        self.file = open('douban_movie.csv', 'w+', encoding='utf-8', newline='')
        self.csv_file = csv.writer(self.file)   # hand the open file to the csv writer
        self.csv_file.writerow(['电影名称','电影评分'])  # header row

    def process_item(self, item, spider):
        # self.file.write(str(item) + '\n')  # option 1: dump the whole item as a string
        adapter = ItemAdapter(item)  # wrap the item in an ItemAdapter
        title = adapter.get('title')
        rating = adapter.get('rating')
        self.csv_file.writerow([title, rating])

        return item  # multiple pipelines may be enabled; returning the item passes it to the next one

    def close_spider(self, spider):
        self.file.close()
        print('爬虫结束关闭文件')
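Note the newline='' argument when opening the file: the csv module handles line endings itself, and without it every record is followed by a blank row on Windows.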

3. Excel file storage

from openpyxl import Workbook
from itemadapter import ItemAdapter

class ExcelCsvDoubanTopPipeline:
    def __init__(self):
        self.wb = Workbook()      # Workbook() creates an empty workbook
        self.ws = self.wb.active  # grab the active worksheet
        self.ws.append(['电影名称','电影评分'])  # header row

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)  # wrap the item in an ItemAdapter
        title = adapter.get('title')
        rating = adapter.get('rating')
        self.ws.append([title, rating])  # openpyxl appends rows with append, not writerow
        return item

    def close_spider(self, spider):
        self.wb.save('data.xlsx')  # write the workbook out to data.xlsx
        print('爬虫结束关闭文件')
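Unlike the txt and csv pipelines, which write as items arrive, openpyxl builds the workbook in memory and nothing reaches disk until wb.save() runs in close_spider, so a crash mid-crawl loses the data collected so far.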

4. MySQL storage

import pymysql
from itemadapter import ItemAdapter

class MysqlDoubanTopPipeline:
    def __init__(self):
        self.db = pymysql.Connect(      # connect to MySQL (host defaults to localhost)
            user='root',                # username
            password='123456',          # password
            database='python_mysql'     # name of the database to connect to
        )

        self.cursor = self.db.cursor()  # create a cursor

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)  # wrap the item in an ItemAdapter
        title = adapter.get('title')
        rating = adapter.get('rating')

        sql = 'insert into douban(title,rating) VALUES (%s,%s)'  # %s are placeholders
        self.cursor.execute(sql, (title, rating))                # execute the statement
        self.db.commit()                                         # commit the transaction

        return item

    def close_spider(self, spider):
        self.db.close()
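The pipeline assumes the douban table already exists in the python_mysql database. A minimal one-off sketch to create it (the column types are assumptions; adjust them to your data):

import pymysql

db = pymysql.Connect(user='root', password='123456', database='python_mysql')
cursor = db.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS douban (
        id INT PRIMARY KEY AUTO_INCREMENT,
        title VARCHAR(255),  -- movie title
        rating VARCHAR(16)   -- movie rating
    )
''')
db.close()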

The meta parameter

Case study: from careers.tencent.com/search.html… scrape the job title, job responsibilities, and job requirements.
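meta is simply a dict attached to a Request; Scrapy carries it through to the resulting Response, so whatever one callback stores in meta can be read back as response.meta in the next callback.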

The tencent.py file:

# Method 1: build the item entirely in the detail callback (no meta)
# import scrapy
# from ..items import TxItem
#
# class TencentSpider(scrapy.Spider):
#     name = "tencent"
#     allowed_domains = ["careers.tencent.com"]
#     start_urls = ["https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1705390036639&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=40001&attrId=&keyword=&pageIndex=1&pageSize=10&language=zh-cn&area=cn"]
#
#     def parse(self, response):
#         data = response.json()
#         for i in data['Data']['Posts']:
#             # RecruitPostName = i['RecruitPostName']
#             PostId = i['PostId']   # use the PostId from the list page to build the detail URL
#             print(i)
#
#             # workyear = i['RequireWorkYearsName']
#
#             detail_url = f'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1705390548381&postId={PostId}&language=zh-cn'
#             yield scrapy.Request(url=detail_url, callback=self.detail_parse)
#
#     def detail_parse(self, response):
#         # the detail API returns the responsibilities and requirements
#         json_data = response.json()
#         item = TxItem()      # create an item object
#
#         item['RecruitPostName'] = json_data['Data']['RecruitPostName']  # job title
#         item['Requirement'] = json_data['Data']['Requirement']          # job requirements
#         item['Responsibility'] = json_data['Data']['Responsibility']    # job responsibilities
#         yield item
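In method 1 everything is read from the detail API, which happens to return RecruitPostName as well, so nothing needs to travel between callbacks. Method 2 below instead creates the item in parse, fills in the job title there, and hands the half-built item to the detail callback through the request's meta dict; it also extends start_urls to cover 50 pages of listings.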


# Method 2: pass the item between callbacks with meta
import scrapy
from ..items import TxItem

class TencentSpider(scrapy.Spider):
    name = "tencent"
    allowed_domains = ["careers.tencent.com"]
    start_urls = [f"https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1705390036639&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=40001&attrId=&keyword=&pageIndex={i}&pageSize=10&language=zh-cn&area=cn" for i in range(1, 51)]

    def parse(self, response):
        data = response.json()
        for i in data['Data']['Posts']:
            item = TxItem()  # create an item object

            item['RecruitPostName'] = i['RecruitPostName']  # 1. create the item and store the job title before passing it on to the detail callback
            PostId = i['PostId']   # use the PostId from the list page to build the detail URL

            detail_url = f'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1705390548381&postId={PostId}&language=zh-cn'
            yield scrapy.Request(url=detail_url, callback=self.detail_parse,
                                 meta={'item': item})   # 2. ship the item to detail_parse via meta

    def detail_parse(self, response):
        # the detail API returns the responsibilities and requirements
        json_data = response.json()
        item = response.meta.get('item')  # 3. retrieve the item passed in via meta

        # item['RecruitPostName'] was already set in parse
        item['Requirement'] = json_data['Data']['Requirement']          # job requirements
        item['Responsibility'] = json_data['Data']['Responsibility']    # job responsibilities
        yield item
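A side note: newer Scrapy versions (1.7+) also offer Request.cb_kwargs for passing user data between callbacks; meta still works, but it is shared with middlewares, while cb_kwargs is reserved for your own data. The same hand-off with cb_kwargs would look like this sketch:

            yield scrapy.Request(url=detail_url, callback=self.detail_parse,
                                 cb_kwargs={'item': item})

    def detail_parse(self, response, item):  # each cb_kwargs key arrives as a keyword argument
        ...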

The items.py file:

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class TxItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    RecruitPostName=scrapy.Field()
    Requirement=scrapy.Field()
    Responsibility=scrapy.Field()

The pipelines.py file:

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from csv import writer

class TxPipeline:
    def __init__(self):
        self.file=open('tx.csv','w+',encoding='utf-8',newline='')
        self.csv_file=writer(self.file)
        self.csv_file.writerow(['职位名称','任职要求','负责工作'])  # header row

    def process_item(self, item, spider):
        adapter=ItemAdapter(item)

        self.csv_file.writerow([adapter.get('RecruitPostName'),
                                adapter.get('Requirement'),
                                adapter.get('Responsibility'),
                                ])                             # write one row per item
        return item


    def close_spider(self,spider):
        self.file.close()
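To switch this pipeline on, register it in settings.py as before (a sketch; tx is the assumed project module name):

ITEM_PIPELINES = {
    'tx.pipelines.TxPipeline': 300,
}

Then start the crawl from the project directory:

scrapy crawl tencent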