Data Extraction
Earlier, while studying XPath, I learned get() and getall(): get() returns the first matching result, while getall() returns every matching result. Today I picked up two more methods that do the same job:
title = response.xpath('//div[@class="hd"]/a/span[1]/text()').get()  # first matching result
print(title)
title1 = response.xpath('//div[@class="hd"]/a/span[1]/text()').extract_first()  # same as get()
print(title1)
titles = response.xpath('//div[@class="hd"]/a/span[1]/text()').getall()  # all matching results
print(titles)
titles1 = response.xpath('//div[@class="hd"]/a/span[1]/text()').extract()  # same as getall()
print(titles1)
Output: both pairs print identical results. get() and getall() are newer methods introduced to replace the extract family; they behave the same, but they make the code clearer, so they are the recommended choice.
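One practical difference worth knowing: get() returns None when nothing matches, and it accepts a default value, which saves a None check downstream. A minimal sketch, reusing the same XPath as above:

# returns 'N/A' instead of None when the XPath matches nothing
title = response.xpath('//div[@class="hd"]/a/span[1]/text()').get(default='N/A')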
Data Storage
The following demonstrates several storage approaches. Whichever one you choose, the pipeline class must be registered as a storage channel under ITEM_PIPELINES in settings.py, otherwise Scrapy will never call it.
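A sketch of that registration, assuming the project is named douban_top and uses the pipeline class names shown below (the Txt class name is my assumption; the others come from the code):

# settings.py (assumed project name: douban_top)
ITEM_PIPELINES = {
    'douban_top.pipelines.TxtDoubanTopPipeline': 300,    # lower number = runs earlier
    'douban_top.pipelines.CsvDoubanTopPipeline': 400,
    'douban_top.pipelines.ExcelCsvDoubanTopPipeline': 500,
    'douban_top.pipelines.MysqlDoubanTopPipeline': 600,
}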
1. TXT storage
from itemadapter import ItemAdapter

class TxtDoubanTopPipeline:  # class definition added; name assumed, matching the naming style below
    # def __init__(self):  # equivalent to open_spider below
    #     self.file = open('douban_movie.txt', 'w+', encoding='utf-8')
    def open_spider(self, spider):  # runs exactly once when the spider starts; usually used to open connections
        self.file = open('douban_movie.txt', 'w+', encoding='utf-8')

    def process_item(self, item, spider):
        # self.file.write(str(item) + '\n')  # option 1: dump the raw item
        adapter = ItemAdapter(item)  # wrap the item in an ItemAdapter
        title = adapter.get('title')
        rating = adapter.get('rating')
        self.file.write(f'电影名称:{title}------电影评分:{rating}\n')
        print('item', item)
        # spider-related info is also available here:
        # print(spider.name, spider.allowed_domains)
        return item

    def close_spider(self, spider):
        self.file.close()
        print('爬虫结束关闭文件')
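All four pipelines read title and rating through ItemAdapter, so they assume an item defined roughly like this (field names taken from the code above; the class name is hypothetical):

import scrapy

class DoubanItem(scrapy.Item):  # hypothetical name; fields match what the pipelines read
    title = scrapy.Field()
    rating = scrapy.Field()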
2. CSV storage
import csv
from itemadapter import ItemAdapter

class CsvDoubanTopPipeline:
    def __init__(self):
        self.file = open('douban_movie.csv', 'w+', encoding='utf-8', newline='')  # newline='' prevents blank rows on Windows
        self.csv_file = csv.writer(self.file)  # hand the file object to the csv writer
        self.csv_file.writerow(['电影名称', '电影评分'])  # header row

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)  # wrap the item in an ItemAdapter
        title = adapter.get('title')
        rating = adapter.get('rating')
        self.csv_file.writerow([title, rating])
        return item  # pass the item on to the next pipeline (multiple pipelines are enabled)

    def close_spider(self, spider):
        self.file.close()
        print('爬虫结束关闭文件')
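One caveat: Excel on Windows often shows garbled characters when opening a UTF-8 CSV that lacks a BOM. If the file is meant to be opened in Excel, 'utf-8-sig' is the safer encoding:

# 'utf-8-sig' writes a BOM so Excel detects the encoding correctly
self.file = open('douban_movie.csv', 'w+', encoding='utf-8-sig', newline='')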
3. Excel storage
from openpyxl import Workbook
from itemadapter import ItemAdapter

class ExcelCsvDoubanTopPipeline:
    def __init__(self):
        self.wb = Workbook()  # Workbook() creates an empty workbook
        self.ws = self.wb.active  # grab the active worksheet
        self.ws.append(['电影名称', '电影评分'])  # header row

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)  # wrap the item in an ItemAdapter
        title = adapter.get('title')
        rating = adapter.get('rating')
        self.ws.append([title, rating])  # openpyxl appends rows with append(), not writerow()
        return item

    def close_spider(self, spider):
        self.wb.save('data.xlsx')  # write the workbook to data.xlsx
        print('爬虫结束关闭文件')
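Note that openpyxl keeps the whole workbook in memory and only touches disk at wb.save(), so if the spider crashes mid-run, nothing is written. For long crawls, calling wb.save() periodically from process_item (say, every few hundred items) is a cheap safeguard.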
4. MySQL storage
import pymysql
from itemadapter import ItemAdapter

class MysqlDoubanTopPipeline:
    def __init__(self):
        self.db = pymysql.Connect(       # open the database connection
            host='localhost',            # server address (pymysql defaults to localhost if omitted)
            user='root',                 # username
            password='123456',           # password
            database='python_mysql'      # database to use
        )
        self.cursor = self.db.cursor()   # create a cursor

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)  # wrap the item in an ItemAdapter
        title = adapter.get('title')
        rating = adapter.get('rating')
        sql = 'INSERT INTO douban(title, rating) VALUES (%s, %s)'  # %s are pymysql placeholders
        self.cursor.execute(sql, (title, rating))  # parameterized execution avoids SQL injection
        self.db.commit()  # commit the transaction
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.db.close()
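This pipeline assumes a douban table already exists. A sketch of creating it from open_spider, with assumed column types, might look like:

    # add to MysqlDoubanTopPipeline; schema is an assumption, adjust types to your data
    def open_spider(self, spider):
        self.cursor.execute(
            'CREATE TABLE IF NOT EXISTS douban ('
            'id INT AUTO_INCREMENT PRIMARY KEY, '
            'title VARCHAR(255), '
            'rating VARCHAR(16))'
        )
        self.db.commit()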
The meta parameter
Example: scrape the job title (RecruitPostName), responsibilities (Responsibility), and requirements (Requirement) from careers.tencent.com/search.html…
tencent.py:
# Method 1: request the detail page and build the whole item in detail_parse
# import scrapy
# from ..items import TxItem
#
# class TencentSpider(scrapy.Spider):
#     name = "tencent"
#     allowed_domains = ["careers.tencent.com"]
#     start_urls = ["https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1705390036639&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=40001&attrId=&keyword=&pageIndex=1&pageSize=10&language=zh-cn&area=cn"]
#
#     def parse(self, response):
#         data = response.json()
#         for i in data['Data']['Posts']:
#             PostId = i['PostId']  # use the PostId from the list page to build the detail URL
#             print(i)
#             detail_url = f'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1705390548381&postId={PostId}&language=zh-cn'
#             yield scrapy.Request(url=detail_url, callback=self.detail_parse)
#
#     def detail_parse(self, response):
#         # the detail response carries the responsibilities and requirements
#         json_data = response.json()
#         item = TxItem()  # create an item
#         item['RecruitPostName'] = json_data['Data']['RecruitPostName']  # job title
#         item['Requirement'] = json_data['Data']['Requirement']          # requirements
#         item['Responsibility'] = json_data['Data']['Responsibility']    # responsibilities
#         yield item
# Method 2: pass data between callbacks via meta
import scrapy
from ..items import TxItem

class TencentSpider(scrapy.Spider):
    name = "tencent"
    allowed_domains = ["careers.tencent.com"]
    # crawl list pages 1 through 50
    start_urls = [f"https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1705390036639&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=40001&attrId=&keyword=&pageIndex={i}&pageSize=10&language=zh-cn&area=cn" for i in range(1, 51)]

    def parse(self, response):
        data = response.json()
        for i in data['Data']['Posts']:
            item = TxItem()  # 1. create the item and store the post name from the list page
            item['RecruitPostName'] = i['RecruitPostName']
            PostId = i['PostId']  # use the PostId from the list page to build the detail URL
            detail_url = f'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1705390548381&postId={PostId}&language=zh-cn'
            yield scrapy.Request(url=detail_url, callback=self.detail_parse,
                                 meta={'item': item})  # 2. hand the partially filled item to detail_parse

    def detail_parse(self, response):
        # the detail response carries the responsibilities and requirements
        json_data = response.json()
        item = response.meta.get('item')  # 3. retrieve the item passed through meta
        item['Requirement'] = json_data['Data']['Requirement']          # requirements
        item['Responsibility'] = json_data['Data']['Responsibility']    # responsibilities
        yield item
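meta works, but it also carries Scrapy's own request bookkeeping; since Scrapy 1.7 the documented way to pass user data between callbacks is cb_kwargs, which delivers the value as a named callback argument. A minimal sketch of the same hand-off in the spider above:

    # in parse(), replace the meta= argument with cb_kwargs:
    yield scrapy.Request(url=detail_url, callback=self.detail_parse,
                         cb_kwargs={'item': item})

    # detail_parse then receives the item directly as a parameter
    def detail_parse(self, response, item):
        json_data = response.json()
        item['Requirement'] = json_data['Data']['Requirement']
        item['Responsibility'] = json_data['Data']['Responsibility']
        yield item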
items.py:
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class TxItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    RecruitPostName = scrapy.Field()
    Requirement = scrapy.Field()
    Responsibility = scrapy.Field()
pipelines.py:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from csv import writer
class TxPipeline:
    def __init__(self):
        self.file = open('tx.csv', 'w+', encoding='utf-8', newline='')
        self.csv_file = writer(self.file)
        self.csv_file.writerow(['职位名称', '任职要求', '负责工作'])  # header row

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        self.csv_file.writerow([adapter.get('RecruitPostName'),
                                adapter.get('Requirement'),
                                adapter.get('Responsibility'),
                                ])  # write one row per item
        return item

    def close_spider(self, spider):
        self.file.close()
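To run the example, enable TxPipeline under ITEM_PIPELINES as shown at the start of the Data Storage section, then start the spider by its name from the project root with scrapy crawl tencent.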