[Crawler Series] Scraping Liepin Job Postings with Scrapy


Disclaimer: This article is for learning and research purposes only. Using it for any illegal purpose is prohibited and at your own risk. If anything here infringes on your rights, please let me know and it will be removed. Thank you!





1. Project Scenario


Target URL: www.liepin.com/zhaopin/?ke…

2. Preparation


2.1 Create the Scrapy project: scrapy startproject liepin_spider
2.2 Create the Scrapy spider: scrapy genspider liepin 'www.liepin.com/zhaopin/'
2.3 Configure settings, proxies, the database connection, etc. (a sketch of the relevant settings follows below)
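As a reference point, here is a minimal sketch of what the relevant parts of settings.py might look like. The concrete values, and the USER_AGENT_LIST consumed later by the downloader middleware, are assumptions; adjust them to your own project.

# settings.py (sketch, values are placeholders)
BOT_NAME = 'liepin_spider'
SPIDER_MODULES = ['liepin_spider.spiders']
NEWSPIDER_MODULE = 'liepin_spider.spiders'

ROBOTSTXT_OBEY = False  # the spider requests listing pages directly

# Pool of User-Agent strings picked at random by the downloader middleware
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
]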

3. Page Analysis

3.1 Inspect the listing page in the browser's developer tools: each search result sits in a div with class "job-info", whose h3 > a link points to the job detail page, and the pager links live under div.pagerbar. (screenshot omitted)
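The listing page itself is driven by URL query parameters rather than a POST body. This is roughly what the spider ends up requesting (parameter values are illustrative placeholders):

# Example search URL built from the parameters used in start_requests
# dqs     - region code
# key     - search keyword
# curPage - page index, starting at 0
# pubTime - publication window (1 = within one day)
# jobKind - job type
https://www.liepin.com/zhaopin/?dqs=<region_code>&key=<keyword>&curPage=0&pubTime=1&jobKind=2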

4. Writing the Code


4.1 Spider Code

# -*- coding: utf-8 -*-
import scrapy
from liepin_spider.items import LiepinSpiderItem
from sql import MyMysql


class LiepinSpider(scrapy.Spider):
    name = 'liepin'
    custom_settings = {
        'ITEM_PIPELINES': {
            'liepin_spider.pipelines.LiepinSpiderPipeline': 200,
        },
        'DOWNLOADER_MIDDLEWARES': {
            # 'liepin_spider.middlewares.LiepinSpiderDownloaderMiddleware': 100,
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # 'liepin_spider.middlewares.MyRetryMiddleware': 110,
        },
        'CONCURRENT_REQUESTS': 1, # number of concurrent requests
        'DOWNLOAD_DELAY': 1,      # delay between requests (seconds)
        'DOWNLOAD_TIMEOUT' : 5,   # request timeout (seconds)
    }

    def __init__(self, s_type=None, c_type=None, *args, **kwargs):  # crawl mode is selected via command-line arguments
        super(LiepinSpider, self).__init__(*args, **kwargs)
        self.c_type = c_type  # crawl schedule type: 1 = daily, 2 = monthly
        self.s_type = s_type  # which keyword/region set to crawl


    def start_requests(self):
        mysql = MyMysql()
        if self.c_type == '1':
            keys = mysql.read_many('select zwmc from lp_job_names where type = 0')  # fetch keywords from the database, partial crawl
        else:
            keys = mysql.read_many('select zwmc from lp_job_names ')  # fetch keywords from the database, full crawl

        if self.s_type == '0':
            quyu_info = mysql.read_many('select dqs,city from lp_job_areas')  # crawl all regions
        else:
            quyu_info = mysql.read_many('select dqs,city from lp_job_areas where type = {}'.format(self.s_type))  # crawl selected regions

        for key in keys:
            for quyu in quyu_info:
                print("Search keyword: " + key[0] + "  current region: " + quyu[1])
                params = (
                    ('dqs', quyu[0]),   # region code
                    ('key', key[0]),    # search keyword
                    ('curPage', '0'),
                    ('pubTime', '1'),   # posted within one day
                    ('jobKind', '2')    # job type
                )
                url = 'https://www.liepin.com/zhaopin/'
                yield scrapy.FormRequest(url=url, method='GET', formdata=params, callback=self.parse, meta={"key_info": key[0], "diqu_info": quyu[1]}, dont_filter=True)
            # All requests for this keyword have been issued; update its status in the database
            print("Keyword: " + key[0] + " done")
            mysql.update("UPDATE lp_job_names set type = '1' WHERE zwmc = '%s' " % key[0])
        # Reset keyword status
        mysql.update("UPDATE lp_job_names set type = '0' WHERE type = '1'")


    def parse(self, response):
        urls = response.xpath('//div[@class="job-info"]/h3/a/@href').getall()  # all job posting links on this page
        key = response.meta["key_info"]
        diqu = response.meta["diqu_info"]
        if len(urls) == 0:
            print(diqu + " has no postings for: " + key)
            pass  # skip when the search returns nothing
        else:
            for url in urls:  # request each job posting
                if 'https://www.liepin.com' not in url:
                    url = 'https://www.liepin.com' + url
                yield scrapy.Request(url=url, callback=self.get_data, meta={"key_info": key, "diqu_info": diqu}, dont_filter=True)

            next_url = 'https://www.liepin.com' + response.xpath('//div[@class="pagerbar"]/a/@href').getall()[-2]  # next-page link

            if 'javascript:' in next_url:  # no next page
                pass
            else:
                print(diqu + " keyword: " + key + " next page", next_url)
                yield scrapy.Request(url=next_url, meta={"key_info": key, "diqu_info": diqu}, callback=self.parse, dont_filter=True)

    def get_data(self, response):
        items = LiepinSpiderItem()
        items['key_word'] = response.meta["key_info"]  # search keyword
        items['diqu'] = response.meta["diqu_info"]     # search region
        items['c_type'] = self.c_type
        items['zhiwei'] = response.xpath('//h1/text()').get()  # job title
        if items['zhiwei'] is None:
            pass
        else:
            items['company'] = response.xpath('//div[@class="title-info"]/h3/a/text()').get()  # company name
            items['salary'] = ''.join(response.xpath('//p[@class="job-item-title"]/text()').getall()).strip()  # salary
            try:
                items['fb_time'] = response.xpath('//p[@class="basic-infor"]/time/@title').get() + response.xpath('//p[@class="basic-infor"]/time/text()').get().strip()  # publication time
            except:
                items['fb_time'] = ''
            items['requirement'] = '#'.join(response.xpath('//div[@class="job-title-left"]/div[@class="job-qualifications"]/span/text()').getall())  # requirements
            items['welfare'] = '#'.join(response.xpath('//div[@class="comp-tag-box"]/ul/li/span/text()').getall())  # benefits
            items['job_description'] = ''.join(response.xpath('//div[@class="content content-word"]/text()').getall()).strip()  # job description
            items['log_url'] = response.xpath('//div[@class="company-logo"]/a/@href').get()  # company logo link
            items['industry'] = response.xpath('//ul[@class="new-compintro"]/li[1]/a/text()').get()  # industry
            company_info = response.xpath('//ul[@class="new-compintro"]/li//text()').getall()
            items['company_size'] = items['company_addr'] = ''
            for num in range(3, len(company_info)):
                if '公司规模' in company_info[num]:
                    items['company_size'] = company_info[num].replace('公司规模:', '')  # company size
                else:
                    items['company_addr'] = company_info[num].replace('公司地址:', '')  # company address
            yield items  # hand the item to the pipeline
            # print(items)



4.2 Database Connection Code
# -*- coding: utf-8 -*-
import pymysql


class MyMysql:
    def __init__(self):
        self.host = 'xxxxx'          # host/IP
        self.port = 3306             # port
        self.user = 'xxxx'           # username
        self.password = 'xxxx'       # password
        self.dbname = 'xxxx'         # database name
        self.charset = 'utf8mb4'     # character set

        # Connect to the database
        self.connect()

    def connect(self):
        # Open the connection and grab a cursor
        self.db = pymysql.connect(host=self.host, port=self.port, user=self.user, password=self.password,
                                  db=self.dbname, charset=self.charset)
        self.cursor = self.db.cursor()

    def run(self, sql):
        ret = None
        try:
            ret = self.cursor.execute(sql)
            self.db.commit()
        except Exception as e:
            self.db.rollback()
        # finally:
        #     self.close()
        return ret

    def rollback(self):
        self.db.rollback()
        self.close()

    def close(self):
        self.cursor.close()
        self.db.close()

    def insert(self, sql):
        try:
            self.cursor.execute(sql)
            self.db.commit()
        except pymysql.err.IntegrityError:
            pass

    def commit(self):
        self.db.commit()
        self.close()

    def update(self, sql):
        return self.run(sql)

    def delete(self, sql):
        return self.run(sql)

    def read_one(self, sql):
        ret = None
        try:
            self.cursor.execute(sql)
            # fetch a single row
            ret = self.cursor.fetchone()
        except Exception as e:
            # print('query failed')
            pass
        # finally:
        #     self.close()
        return ret

    def read_many(self, sql):
        ret = None
        try:
            self.cursor.execute(sql)
            # fetch all rows
            ret = self.cursor.fetchall()
        except Exception as e:
            print('query failed')
        return ret
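For context, this is roughly how the spider above drives MyMysql (a sketch; the contents of lp_job_names and lp_job_areas are whatever keyword and region rows you load yourself):

# Sketch of how the spider uses the helper
mysql = MyMysql()
keys = mysql.read_many('select zwmc from lp_job_names where type = 0')  # tuples of (keyword,)
areas = mysql.read_many('select dqs,city from lp_job_areas')            # tuples of (region_code, city)
for (keyword,) in keys:
    for region_code, city in areas:
        print(keyword, region_code, city)
mysql.update("UPDATE lp_job_names set type = '0' WHERE type = '1'")     # reset status flags
mysql.close()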


4.3 Items Code

# -*- coding: utf-8 -*-
import scrapy


class LiepinSpiderItem(scrapy.Item):
    key_word = scrapy.Field()         # search keyword
    zhiwei = scrapy.Field()           # job title
    company = scrapy.Field()          # company name
    salary = scrapy.Field()           # salary
    diqu = scrapy.Field()             # region
    fb_time = scrapy.Field()          # publication time
    requirement = scrapy.Field()      # requirements
    welfare = scrapy.Field()          # benefits
    job_description = scrapy.Field()  # job description
    log_url = scrapy.Field()          # company logo link
    industry = scrapy.Field()         # industry
    company_size = scrapy.Field()     # company size
    company_addr = scrapy.Field()     # company address
    c_type = scrapy.Field()           # crawl schedule type

4.4 Pipeline Code
# -*- coding: utf-8 -*-

import emoji
import pymysql
from pymongo import MongoClient
from twisted.enterprise import adbapi
import copy

adbparams_info = dict(
            host='xxxx',        # host/IP
            db='xxxx',          # database name
            user='xxxx',        # username
            password='xxxx',    # password
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor  # cursor type
        )

class LiepinSpiderPipeline(object):
    '''Asynchronous MySQL writes'''
    def __init__(self, dbpool):
        self.dbpool = dbpool
        self.conn = MongoClient('xxxx', 27017)  # MongoDB connection (optional)

    @classmethod
    def from_settings(cls, settings):
        # Build the Twisted connection pool from the database parameters above
        adbparams = adbparams_info
        dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
        # Return an instance of the pipeline
        return cls(dbpool)

    def process_item(self, item, spider):
        # Insert the item into MySQL asynchronously via Twisted
        item1 = copy.deepcopy(item)
        # MySQL insert
        query = self.dbpool.runInteraction(self.do_insert, item1)  # which method to run and with what data
        query.addErrback(self.handle_error, item1)                 # log asynchronous insert failures
        # MongoDB insert: just the job title
        self.mongo_insert(item['zhiwei'])
        return item

    def mongo_insert(self, job):
        db = self.conn.crawlab_test  # MongoDB database
        my_set = db.jobs             # target collection
        data_test1 = {
            'job': job
        }
        my_set.insert_one(data_test1)

    def handle_error(self, failure, item):
        # Print asynchronous insert failures
        print(failure, "database error")

    def do_insert(self, cursor, item):
        insert_sql = """insert into lp_job_data(key_word,zhiwei,company,salary,diqu,fb_time,requirement,welfare,
                                                  job_description,logo_url,industry,company_size,company_addr,type,c_type)
                             VALUES('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','5','%s')
                             """% (
            item['key_word'],item['zhiwei'],item['company'],item['salary'],item['diqu'],
            item['fb_time'],item['requirement'],item['welfare'],item['job_description'].replace("'",'"'),item['log_url'],
            item['industry'],item['company_size'],item['company_addr'],item['c_type'])
        try:
            cursor.execute(insert_sql)
        except:
            insert_sql = emoji.demojize(insert_sql)  # replace emoji with text aliases the utf8 column can store
            cursor.execute(insert_sql)

    def close_spider(self, spider):
        self.conn.close()


4.5 Downloader Middleware Code (Proxy Configuration)
# -*- coding: utf-8 -*-

import json,time
import random
import requests
from liepin_spider import settings


class LiepinSpiderDownloaderMiddleware(object):
    def process_request(self, request, spider):
        # Set a random User-Agent
        ua_random = random.choice(settings.USER_AGENT_LIST)
        request.headers['User-Agent'] = ua_random

        # Add your own proxy settings here (see the sketch below)

    def process_exception(self, request, exception, spider):
        # On failure, pick a new User-Agent (and proxy) and retry the request
        ua_random = random.choice(settings.USER_AGENT_LIST)
        request.headers['User-Agent'] = ua_random
        return request
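The proxy part is left to the reader in the original. A minimal sketch of one way to fill it in, assuming you have your own proxy endpoint (the URL below is a placeholder):

# Inside process_request / process_exception (sketch; proxy URL is a placeholder)
proxy = 'http://user:password@proxy.example.com:8888'  # hypothetical proxy endpoint
request.meta['proxy'] = proxy                          # Scrapy's HttpProxyMiddleware reads this key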


4.6 Run Script
import os

def start_all():
    os.system('scrapy crawl liepin -a s_type="0" -a c_type="2"')  # crawl all regions

# def start_type1():
#     os.system('scrapy crawl liepin -a s_type="1" ')  # once a day  (tier-1 + new tier-1 cities)
#
# def start_type2():
#     os.system('scrapy crawl liepin -a s_type="2"')  # once a week  (tier-2 cities)
#
# def start_type3():
#     os.system('scrapy crawl liepin -a s_type="3"')  # the rest, once a month

def new_type():
    os.system('scrapy crawl liepin -a s_type="4" -a c_type="1"')  # 5 cities, selected positions

if __name__ == '__main__':
    # start_type1()
    # start_type2()
    # start_type3()
    new_type()
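The commented-out variants hint at running different keyword/region sets on different schedules (daily, weekly, monthly). A rough sketch of one way to drive the daily subset from Python itself; purely illustrative, a real deployment would more likely use cron or a scheduler:

import os
import time

def run_daily_forever():
    # Illustrative only: run the daily subset once every 24 hours
    while True:
        os.system('scrapy crawl liepin -a s_type="4" -a c_type="1"')
        time.sleep(24 * 60 * 60)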

5. Running Results


5.1 Here is what the final run looks like:

(screenshot of the crawler output omitted)

5.2 The data is basically all being captured. To wrap up, here are the table structures for the job keywords, the region data, and the crawl results (a rough reconstruction follows, since the original screenshots are not reproduced here).

(table structure screenshots omitted)
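Since the screenshots are not reproduced, here is a hedged sketch of what the three tables could look like, inferred purely from the columns the code reads and writes; the actual types and lengths in the original schema may differ.

-- Sketch inferred from the queries in the code; not the author's original DDL
CREATE TABLE lp_job_names (
    zwmc VARCHAR(100),       -- job keyword used for the search
    type TINYINT DEFAULT 0   -- status/category flag toggled by the spider
);

CREATE TABLE lp_job_areas (
    dqs  VARCHAR(20),        -- Liepin region code passed as the dqs parameter
    city VARCHAR(50),        -- city name, used for logging
    type TINYINT             -- region group selected via the s_type argument
);

CREATE TABLE lp_job_data (
    key_word VARCHAR(100),       -- search keyword
    zhiwei VARCHAR(200),         -- job title
    company VARCHAR(200),        -- company name
    salary VARCHAR(50),          -- salary
    diqu VARCHAR(50),            -- region
    fb_time VARCHAR(50),         -- publication time
    requirement VARCHAR(255),    -- requirements
    welfare VARCHAR(255),        -- benefits
    job_description TEXT,        -- job description
    logo_url VARCHAR(255),       -- company logo link
    industry VARCHAR(100),       -- industry
    company_size VARCHAR(50),    -- company size
    company_addr VARCHAR(255),   -- company address
    type VARCHAR(10),            -- source flag (hard-coded to '5' in the pipeline)
    c_type VARCHAR(10)           -- crawl schedule type
);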