Disclaimer: This article is for learning and research purposes only. Using it for illegal purposes is prohibited and entirely at your own risk. If anything here infringes your rights, please let me know and it will be removed. Thank you!
@[TOC](Scraping Liepin job postings with Scrapy)
1. Project Scenario
Target URL: www.liepin.com/zhaopin/?ke…
2. Preparation
2.1 Create the Scrapy project: scrapy startproject liepin_spider
2.2 Generate the spider: scrapy genspider liepin 'www.liepin.com/zhaopin/'
2.3 Configure settings, proxies, the database connection, and so on. A sketch of the relevant settings follows.
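The downloader middleware in section 4.5 picks a random User-Agent from a `USER_AGENT_LIST` setting, so that list must exist in settings.py. Below is a minimal sketch of the relevant part; the user-agent strings and the `ROBOTSTXT_OBEY` value are illustrative assumptions, not copied from the original project:

```python
# settings.py -- minimal sketch; values are placeholders, not the project's real config
BOT_NAME = 'liepin_spider'
ROBOTSTXT_OBEY = False  # assumption: this crawler does not honour robots.txt

# pool of User-Agent strings the downloader middleware (section 4.5) chooses from
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36',
]
```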
3. Page Analysis
3.1 As shown in the figure below.
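To make the figure concrete: the search results page is driven by plain GET query parameters, which is exactly what the spider in section 4.1 sends. A small illustration of how those parameters map onto the URL (the region code and keyword below are placeholders, not real values):

```python
# illustration only: how the spider's GET parameters map onto the Liepin search URL
from urllib.parse import urlencode

params = {'dqs': '<region-code>', 'key': '<keyword>', 'curPage': '0', 'pubTime': '1', 'jobKind': '2'}
print('https://www.liepin.com/zhaopin/?' + urlencode(params))
```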
4. Writing the Code
4.1 Spider code
```python
# -*- coding: utf-8 -*-
import scrapy

from liepin_spider.items import LiepinSpiderItem
from sql import MyMysql


class LiepinSpider(scrapy.Spider):
    name = 'liepin'
    custom_settings = {
        'ITEM_PIPELINES': {
            'liepin_spider.pipelines.LiepinSpiderPipeline': 200,
        },
        'DOWNLOADER_MIDDLEWARES': {
            # 'liepin_spider.middlewares.LiepinSpiderDownloaderMiddleware': 100,
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # 'liepin_spider.middlewares.MyRetryMiddleware': 110,
        },
        'CONCURRENT_REQUESTS': 1,   # number of concurrent requests
        'DOWNLOAD_DELAY': 1,        # delay between requests (seconds)
        'DOWNLOAD_TIMEOUT': 5,      # request timeout (seconds)
    }

    def __init__(self, s_type=None, c_type=None, *args, **kwargs):  # command-line args select the crawl mode
        super(LiepinSpider, self).__init__(*args, **kwargs)
        self.c_type = c_type  # crawl-frequency type: 1 = daily, 2 = monthly
        self.s_type = s_type  # keyword/region type

    def start_requests(self):
        mysql = MyMysql()
        if self.c_type == '1':
            # read only the pending keywords from the database (partial crawl)
            keys = mysql.read_many('select zwmc from lp_job_names where type = 0')
        else:
            # read all keywords from the database (full crawl)
            keys = mysql.read_many('select zwmc from lp_job_names')
        if self.s_type == '0':
            # crawl every region
            quyu_info = mysql.read_many('select dqs,city from lp_job_areas')
        else:
            # crawl only the regions of the given type
            quyu_info = mysql.read_many('select dqs,city from lp_job_areas where type = {}'.format(self.s_type))
        for key in keys:
            for quyu in quyu_info:
                print("Search keyword: " + key[0] + "  current region: " + quyu[1])
                params = (
                    ('dqs', quyu[0]),    # region parameter
                    ('key', key[0]),     # search keyword
                    ('curPage', '0'),
                    ('pubTime', '1'),    # posted within one day
                    ('jobKind', '2'),    # job type
                )
                url = 'https://www.liepin.com/zhaopin/'
                yield scrapy.FormRequest(url=url, method='GET', formdata=params, callback=self.parse,
                                         meta={"key_info": key[0], "diqu_info": quyu[1]}, dont_filter=True)
            # all requests for this keyword have been issued, update its status in the database
            print("Keyword finished: " + key[0])
            mysql.update("UPDATE lp_job_names set type = '1' WHERE zwmc = '%s' " % key[0])
        # reset the keyword status
        mysql.update("UPDATE lp_job_names set type = '0' WHERE type = '1'")

    def parse(self, response):
        urls = response.xpath('//div[@class="job-info"]/h3/a/@href').getall()  # all job-detail links
        key = response.meta["key_info"]
        diqu = response.meta["diqu_info"]
        if len(urls) == 0:
            print(diqu + " has no jobs for keyword: " + key)  # nothing found, skip
        else:
            for url in urls:  # request every job-detail page
                if 'https://www.liepin.com' not in url:
                    url = 'https://www.liepin.com' + url
                yield scrapy.Request(url=url, callback=self.get_data,
                                     meta={"key_info": key, "diqu_info": diqu}, dont_filter=True)
            next_url = 'https://www.liepin.com' + response.xpath('//div[@class="pagerbar"]/a/@href').getall()[-2]  # next-page link
            if 'javascript:' in next_url:  # no next page
                pass
            else:
                print(diqu + " keyword: " + key + "  next page: ", next_url)
                yield scrapy.Request(url=next_url, meta={"key_info": key, "diqu_info": diqu},
                                     callback=self.parse, dont_filter=True)

    def get_data(self, response):
        items = LiepinSpiderItem()
        items['key_word'] = response.meta["key_info"]  # search keyword
        items['diqu'] = response.meta["diqu_info"]     # search region
        items['c_type'] = self.c_type
        items['zhiwei'] = response.xpath('//h1/text()').get()  # job title
        if items['zhiwei'] is None:
            pass
        else:
            items['company'] = response.xpath('//div[@class="title-info"]/h3/a/text()').get()  # company name
            items['salary'] = ''.join(response.xpath('//p[@class="job-item-title"]/text()').getall()).strip()  # salary
            try:
                items['fb_time'] = response.xpath('//p[@class="basic-infor"]/time/@title').get() + \
                                   response.xpath('//p[@class="basic-infor"]/time/text()').get().strip()  # publish time
            except Exception:
                items['fb_time'] = ''
            items['requirement'] = '#'.join(response.xpath('//div[@class="job-title-left"]/div[@class="job-qualifications"]/span/text()').getall())  # requirements
            items['welfare'] = '#'.join(response.xpath('//div[@class="comp-tag-box"]/ul/li/span/text()').getall())  # benefits
            items['job_description'] = ''.join(response.xpath('//div[@class="content content-word"]/text()').getall()).strip()  # job description
            items['log_url'] = response.xpath('//div[@class="company-logo"]/a/@href').get()  # company logo URL
            items['industry'] = response.xpath('//ul[@class="new-compintro"]/li[1]/a/text()').get()  # industry
            company_info = response.xpath('//ul[@class="new-compintro"]/li//text()').getall()
            items['company_size'] = items['company_addr'] = ''
            for num in range(3, len(company_info)):
                if '公司规模' in company_info[num]:
                    items['company_size'] = company_info[num].replace('公司规模:', '')  # company size
                else:
                    items['company_addr'] = company_info[num].replace('公司地址:', '')  # company address
            yield items
            # print(items)  # uncomment to inspect items during debugging instead of storing them
```
4.2 Database connection code
```python
# -*- coding: utf-8 -*-
import pymysql


class MyMysql:
    def __init__(self):
        self.host = 'xxxxx'        # host / IP
        self.port = 3306           # port
        self.user = 'xxxx'         # username
        self.password = 'xxxx'     # password
        self.dbname = 'xxxx'       # database name
        self.charset = 'utf8mb4'   # character set
        # connect to the database
        self.connect()

    def connect(self):
        # open the connection and get a cursor
        self.db = pymysql.connect(host=self.host, port=self.port, user=self.user, password=self.password,
                                  db=self.dbname, charset=self.charset)
        self.cursor = self.db.cursor()

    def run(self, sql):
        ret = None
        try:
            ret = self.cursor.execute(sql)
            self.db.commit()
        except Exception as e:
            self.db.rollback()
        # finally:
        #     self.close()
        return ret

    def rollback(self):
        self.db.rollback()
        self.close()

    def close(self):
        self.cursor.close()
        self.db.close()

    def insert(self, sql):
        try:
            self.cursor.execute(sql)
            self.db.commit()
        except pymysql.err.IntegrityError:
            pass  # ignore duplicate-key errors

    def commit(self):
        self.db.commit()
        self.close()

    def update(self, sql):
        return self.run(sql)

    def delete(self, sql):
        return self.run(sql)

    def read_one(self, sql):
        ret = None
        try:
            self.cursor.execute(sql)
            # fetch a single row
            ret = self.cursor.fetchone()
        except Exception as e:
            # print('query failed')
            pass
        # finally:
        #     self.close()
        return ret

    def read_many(self, sql):
        ret = None
        try:
            self.cursor.execute(sql)
            # fetch all rows
            ret = self.cursor.fetchall()
        except Exception as e:
            print('query failed')
        return ret
```
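For reference, this is roughly how the spider in section 4.1 drives the helper; a standalone usage sketch, assuming the lp_job_names table from that section already exists:

```python
# usage sketch for MyMysql, assuming the lp_job_names table from section 4.1 exists
from sql import MyMysql

mysql = MyMysql()
keys = mysql.read_many('select zwmc from lp_job_names where type = 0')  # tuples of keywords
for key in keys:
    print(key[0])
mysql.update("UPDATE lp_job_names set type = '0' WHERE type = '1'")     # reset keyword status
mysql.close()
```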
4.3 Items code
```python
# -*- coding: utf-8 -*-
import scrapy


class LiepinSpiderItem(scrapy.Item):
    key_word = scrapy.Field()         # search keyword
    zhiwei = scrapy.Field()           # job title
    company = scrapy.Field()          # company name
    salary = scrapy.Field()           # salary
    diqu = scrapy.Field()             # region
    fb_time = scrapy.Field()          # publish time
    requirement = scrapy.Field()      # requirements
    welfare = scrapy.Field()          # benefits
    job_description = scrapy.Field()  # job description
    log_url = scrapy.Field()          # company logo URL
    industry = scrapy.Field()         # industry
    company_size = scrapy.Field()     # company size
    company_addr = scrapy.Field()     # company address
    c_type = scrapy.Field()           # crawl-frequency type
```
4.4 Pipeline code
```python
# -*- coding: utf-8 -*-
import copy

import emoji
import pymysql
from pymongo import MongoClient
from twisted.enterprise import adbapi

adbparams_info = dict(
    host='xxxx',      # host / IP
    db='xxxx',        # database name
    user='xxxx',      # username
    password='xxxx',  # password
    charset='utf8',
    cursorclass=pymysql.cursors.DictCursor  # cursor type
)


class LiepinSpiderPipeline(object):
    '''asynchronous MySQL writes'''

    def __init__(self, dbpool):
        self.dbpool = dbpool
        self.conn = MongoClient('xxxx', 27017)  # MongoDB connection, optional

    @classmethod
    def from_settings(cls, settings):
        # build the connection-parameter dict and the Twisted connection pool
        adbparams = adbparams_info
        dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
        # return an instance built with the pool
        return cls(dbpool)

    def process_item(self, item, spider):
        # insert the item into MySQL asynchronously via Twisted
        item1 = copy.deepcopy(item)
        # write to MySQL: specify the callable and the data it operates on
        query = self.dbpool.runInteraction(self.do_insert, item1)
        query.addErrback(self.handle_error, item1)  # report asynchronous insert errors
        # write the job title to MongoDB
        self.mongo_insert(item['zhiwei'])
        return item

    def mongo_insert(self, job):
        db = self.conn.crawlab_test  # MongoDB database
        my_set = db.jobs             # target collection
        data_test1 = {
            'job': job
        }
        my_set.insert_one(data_test1)

    def handle_error(self, failure, item):
        # print asynchronous insert exceptions
        print(failure, "database error")

    def do_insert(self, cursor, item):
        insert_sql = """insert into lp_job_data(key_word,zhiwei,company,salary,diqu,fb_time,requirement,welfare,
            job_description,logo_url,industry,company_size,company_addr,type,c_type)
            VALUES('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','5','%s')
            """ % (
            item['key_word'], item['zhiwei'], item['company'], item['salary'], item['diqu'],
            item['fb_time'], item['requirement'], item['welfare'], item['job_description'].replace("'", '"'), item['log_url'],
            item['industry'], item['company_size'], item['company_addr'], item['c_type'])
        try:
            cursor.execute(insert_sql)
        except Exception:
            insert_sql = emoji.demojize(insert_sql)  # replace emoji the utf8 table cannot store
            cursor.execute(insert_sql)

    def close_spider(self, spider):
        self.conn.close()
```
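do_insert above builds the SQL with %-string formatting, which is why single quotes in job_description are swapped for double quotes. If you would rather let the driver handle quoting and escaping, one alternative is a parameterized execute. This is only a sketch (same columns and literal type value as above), not the original project's code:

```python
# sketch: parameterized version of do_insert, letting pymysql handle quoting/escaping
def do_insert(self, cursor, item):
    insert_sql = """insert into lp_job_data(key_word,zhiwei,company,salary,diqu,fb_time,requirement,welfare,
        job_description,logo_url,industry,company_size,company_addr,type,c_type)
        VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,'5',%s)"""
    cursor.execute(insert_sql, (
        item['key_word'], item['zhiwei'], item['company'], item['salary'], item['diqu'],
        item['fb_time'], item['requirement'], item['welfare'], item['job_description'], item['log_url'],
        item['industry'], item['company_size'], item['company_addr'], item['c_type']))
```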
4.5 Downloader middleware code (random User-Agent / proxy)
```python
# -*- coding: utf-8 -*-
import json, time  # kept from the original; typically used by the proxy-fetching code you add yourself
import random

import requests

from liepin_spider import settings


class LiepinSpiderDownloaderMiddleware(object):
    def process_request(self, request, spider):
        # set a random User-Agent
        ua_random = random.choice(settings.USER_AGENT_LIST)
        request.headers['User-Agent'] = ua_random
        # -- add your own proxy settings here --

    def process_exception(self, request, exception, spider):
        # -- on failure, attach a fresh proxy and re-issue the request --
        ua_random = random.choice(settings.USER_AGENT_LIST)
        request.headers['User-Agent'] = ua_random
        return request
```
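The proxy wiring is intentionally left out above. A minimal sketch of one way to fill it in, assuming a hypothetical PROXY_LIST defined in settings.py (the original project does not show where its proxies come from):

```python
# drop-in replacement for process_request above; PROXY_LIST is a hypothetical setting,
# e.g. PROXY_LIST = ['http://user:pass@1.2.3.4:8888', ...] in settings.py
def process_request(self, request, spider):
    request.headers['User-Agent'] = random.choice(settings.USER_AGENT_LIST)  # random UA
    request.meta['proxy'] = random.choice(settings.PROXY_LIST)               # random proxy
```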
4.6 Run script
```python
import os


def start_all():
    os.system('scrapy crawl liepin -a s_type="0" -a c_type="2"')  # crawl every region

# def start_type1():
#     os.system('scrapy crawl liepin -a s_type="1"')  # once a day (tier-1 + new tier-1 cities)
#
# def start_type2():
#     os.system('scrapy crawl liepin -a s_type="2"')  # once a week (tier-2 cities)
#
# def start_type3():
#     os.system('scrapy crawl liepin -a s_type="3"')  # everything else, once a month


def new_type():
    os.system('scrapy crawl liepin -a s_type="4" -a c_type="1"')  # 5 cities, selected job titles


if __name__ == '__main__':
    # start_type1()
    # start_type2()
    # start_type3()
    new_type()
```
5. Run Results
5.1 Let's take a look at what we get when the code runs.
5.2 Essentially all of the data is captured. To wrap up, here are the table structures for the job keywords, the region data, and the crawl results~