Scrapy crawler workflow
Question: how do we implement pagination?
How does the requests module send the request for the next page?
- Find the address of the next page
- Then call requests.get(url) on it (see the short sketch below)
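A minimal requests version of that idea; the URL template and the page range here are made-up placeholders, not taken from these notes:

import requests

# hypothetical paginated list URL; {} is the page number
page_url_template = 'https://example.com/list?page={}'

for page in range(1, 11):
    resp = requests.get(page_url_template.format(page))
    print(resp.status_code, len(resp.text))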
Approach: 1. find the address of the next page; 2. construct a Request for that next-page URL and hand it to the scheduler (a generic sketch follows; the concrete example below uses a formatted API URL instead).
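Inside a spider (with import scrapy at the top of the file), the pattern looks roughly like this; the CSS selector for the "next" link is hypothetical and depends on the target site:

def parse(self, response):
    # ...extract data from the current page here...

    # 1. find the address of the next page (selector is site-specific)
    next_href = response.css('a.next::attr(href)').get()
    if next_href:
        # 2. build a Request for it; yielding it hands it to the scheduler
        yield scrapy.Request(url=response.urljoin(next_href), callback=self.parse)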
Trying it on a real site
Create the project
PS E:\Study\code\Python\网络爬虫\ScrapyStudy> scrapy startproject tencent
Generate a spider
PS E:\Study\code\Python\网络爬虫\ScrapyStudy\tencent> scrapy genspider hr careers.tencent.com
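This creates tencent/spiders/hr.py. Apart from small differences between Scrapy versions, the generated skeleton looks like this:

import scrapy


class HrSpider(scrapy.Spider):
    name = "hr"
    allowed_domains = ["careers.tencent.com"]
    start_urls = ["https://careers.tencent.com"]

    def parse(self, response):
        pass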
Replace start_urls
import json  # used below to parse the JSON API responses

import scrapy


class HrSpider(scrapy.Spider):
    name = "hr"
    allowed_domains = ["careers.tencent.com"]
    # start_urls = ["https://careers.tencent.com"]
    # Paginated list API; the {} in pageIndex is filled in with format()
    one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1731417323946&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
    start_urls = [one_url.format(1)]

    def parse(self, response):
        for page in range(1, 11):
            url = self.one_url.format(page)
This gives us the URL for every page.
Send the requests
def parse(self, response):
    for page in range(1, 11):
        url = self.one_url.format(page)
        # hand each page request to the scheduler
        yield scrapy.Request(
            url=url,
            callback=self.parse_one
        )

def parse_one(self, response):
    # the API returns JSON, so parse it with the json module
    data = json.loads(response.text)
    print(data)
Start the spider and the data comes back.
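From inside the project directory, the spider is started by its name:

PS E:\Study\code\Python\网络爬虫\ScrapyStudy\tencent> scrapy crawl hr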
Extract the data
First print each job to see which fields are available:

for job in data['Data']['Posts']:
    print(job)

Then build an item from the fields we want:

for job in data['Data']['Posts']:
    # print(job)
    item = {}
    item['job_name'] = job['RecruitPostName']
    item['address'] = job['LocationName']
    item['CategoryName'] = job['CategoryName']
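For orientation, the part of the list-API response these loops rely on has roughly this shape; only the key names come from the code above, the values are placeholders:

data = {
    'Data': {
        'Posts': [
            {
                'RecruitPostName': '...',   # job title
                'LocationName': '...',      # work location
                'CategoryName': '...',      # job category
                'PostId': '...',            # used in the next step to build the detail URL
                # other fields omitted
            },
            # ...one entry per job, pageSize per page
        ]
    }
}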
Fetch the detail-page data (this uses a second URL template, two_url, defined on the class next to one_url; its exact address is not reproduced in these notes)
def parse_one(self, response):
    data = json.loads(response.text)
    # print(data)
    for job in data['Data']['Posts']:
        # print(job)
        item = {}
        item['job_name'] = job['RecruitPostName']
        item['address'] = job['LocationName']
        item['CategoryName'] = job['CategoryName']
        post_id = job['PostId']
        # two_url is the detail-API URL template (a class attribute, defined like one_url)
        detail_url = self.two_url.format(post_id)
        yield scrapy.Request(
            url=detail_url,
            meta={'item': item},  # pass the partially filled item to the next callback
            callback=self.parse_two
        )

def parse_two(self, response):
    # item = response.meta['item']
    item = response.meta.get('item')
    # print(item)
    data = json.loads(response.text)['Data']
    # print(data)
    item['RecruitPostName'] = data['RecruitPostName']
    print(item)
Return the item data to the pipeline
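In Scrapy, items reach the pipeline by being yielded from the callback, so the final print in parse_two becomes a yield, along these lines:

def parse_two(self, response):
    item = response.meta.get('item')
    data = json.loads(response.text)['Data']
    item['RecruitPostName'] = data['RecruitPostName']
    # print(item)
    yield item  # yielded items are handed to the enabled item pipelines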
Enable the pipeline in settings.py
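In the project's settings.py, uncomment (or add) the ITEM_PIPELINES entry; 300 is the default priority value:

ITEM_PIPELINES = {
    "tencent.pipelines.TencentPipeline": 300,
}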
The items now come back as expected.
Customize the pipeline to save the data to a CSV file
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import csv

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class TencentPipeline:
    def open_spider(self, spider):
        print("爬虫开始运行")  # "spider started"
        # open the output file for writing
        self.file = open('./tencentJob.csv', 'w', newline='', encoding="utf-8")
        # write the header row
        self.writer = csv.DictWriter(self.file, fieldnames=['job_name', 'address', 'CategoryName', 'RecruitPostName'])
        self.writer.writeheader()

    def process_item(self, item, spider):
        # print(item)
        # write the item to the file
        self.writer.writerow(item)
        return item

    def close_spider(self, spider):
        print("数据写入成功")  # "data written successfully"
        print("爬虫运行结束")  # "spider finished"
        self.file.close()
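After running scrapy crawl hr again, tencentJob.csv should start with the header row produced by writeheader(); the data rows depend on what the API returns at crawl time:

job_name,address,CategoryName,RecruitPostName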