Preface
I spent the past while doing web development and recently felt like returning to my old trade: web scraping and data analysis. Job postings came to mind (perhaps because that is what has been worrying me lately). Applying for jobs is a big challenge for new graduates, so I hope to provide some clear, well-organized information that helps everyone land a good job.
This article covers the data collection; the data analysis will follow in a later post and be shared with you. There is something nice at the end of the article.
I. Determining the Target
There are plenty of job-listing sites online, such as Lagou, 58.com, and Zhilian Zhaopin. This crawl pulls its data from two of them, Lagou and Zhilian, because both categorize positions clearly and carry a large number of listings.
II. Preparation (Runtime Environment and Libraries)
- Python 3.5 (the Lagou script in part 2 uses Python 2.7)
- pymongo (another database can be substituted)
- multiprocessing
- requests
- BeautifulSoup (plus the lxml and html5lib parsers used by the two scripts)
- itertools
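Before starting, it can be worth confirming that the environment is complete. Below is a minimal sketch of my own (not part of the scrapers) that simply imports the libraries listed above and prints a few versions; an ImportError means something still needs to be installed.
# Environment sanity check: verifies that the libraries listed above (plus the two
# HTML parsers the scripts rely on) can be imported, and prints a few versions.
import sys
import itertools
import multiprocessing

import requests
import bs4
import pymongo
import lxml        # parser used by the Zhilian spider ('lxml')
import html5lib    # parser used by the Lagou spider ('html5lib')

print('Python:', sys.version.split()[0])
print('requests:', requests.__version__)
print('beautifulsoup4:', bs4.__version__)
print('pymongo:', pymongo.version)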
III. Hands-on
The content below is split into two parts: the code for scraping Zhilian Zhaopin and the code for scraping Lagou.
1. Zhilian Zhaopin
First, the scraping steps:
- Build the URLs to crawl from the keyword, the city, and the page number
- Download the content of each page
- Parse the page content and extract the fields we need
- Save the extracted records to a MongoDB database (SQLite would also work)
- Launch multiple processes with multiprocessing, which greatly speeds up the crawl
Main spider file, "zhilian_spider.py":
# _*_ coding: utf-8 _*_
# Author: "DHC_King"
# Python 3.x

### Configuration (zhilian_kw_config.py) ###
TOTAL_PAGE_NUMBER = 90  # total number of result pages to crawl per city; adjust as needed
KEYWORDS = ['java', '艺术', '设计']  # search keywords; add or change them as you like
# crawl listings for the major cities below
ADDRESS = ['全国', '北京', '上海', '广州', '深圳',
           '天津', '武汉', '西安', '成都', '大连',
           '长春', '沈阳', '南京', '济南', '青岛',
           '杭州', '苏州', '无锡', '宁波', '重庆',
           '郑州', '长沙', '福州', '厦门', '哈尔滨',
           '石家庄', '合肥', '惠州', '太原', '昆明',
           '烟台', '佛山', '南昌', '贵阳', '南宁']
MONGO_URI = 'localhost'
MONGO_DB = 'zhilian'
### End of configuration ###

### Main program ###
from datetime import datetime
from urllib.parse import urlencode
from multiprocessing import Pool
import requests
from bs4 import BeautifulSoup
import pymongo
from zhilian_kw_config import *
import time
from itertools import product

client = pymongo.MongoClient(MONGO_URI)
db = client[MONGO_DB]


def download(url):
    """Fetch a page and return its HTML text, using a desktop User-Agent."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'}
    response = requests.get(url, headers=headers)
    return response.text


def get_content(html):
    # record the date on which the record is saved
    date = datetime.now().date()
    date = datetime.strftime(date, '%Y-%m-%d')  # convert to str
    soup = BeautifulSoup(html, 'lxml')
    body = soup.body
    data_main = body.find('div', {'class': 'newlist_list_content'})
    if data_main:
        tables = data_main.find_all('table')
        for i, table_info in enumerate(tables):
            if i == 0:  # the first table is the header row, skip it
                continue
            tds = table_info.find('tr').find_all('td')
            zwmc = tds[0].find('a').get_text()      # job title
            zw_link = tds[0].find('a').get('href')  # link to the job page
            fkl = tds[1].find('span').get_text()    # feedback rate
            gsmc = tds[2].find('a').get_text()      # company name
            zwyx = tds[3].get_text()                # monthly salary
            gzdd = tds[4].get_text()                # work location
            gbsj = tds[5].find('span').get_text()   # publication date
            tr_brief = table_info.find('tr', {'class': 'newlist_tr_detail'})
            # short description of the posting
            brief = tr_brief.find('li', {'class': 'newlist_deatil_last'}).get_text()
            # yield the record lazily as a generator item
            yield {'zwmc': zwmc,        # job title
                   'fkl': fkl,          # feedback rate
                   'gsmc': gsmc,        # company name
                   'zwyx': zwyx,        # monthly salary
                   'gzdd': gzdd,        # work location
                   'gbsj': gbsj,        # publication date
                   'brief': brief,      # short description
                   'zw_link': zw_link,  # link to the job page
                   'save_date': date    # date the record was saved
                   }


def main(args):
    basic_url = '招聘(求职)尽在智联招聘?'  # placeholder left by the original post: replace with the Zhilian job-search URL
    for keyword in KEYWORDS:
        mongo_table = db[keyword]
        paras = {'jl': args[0],  # city
                 'kw': keyword,  # search keyword
                 'p': args[1]    # page number
                 }
        url = basic_url + urlencode(paras)
        # print(url)
        html = download(url)
        # print(html)
        if html:
            data = get_content(html)
            for item in data:
                # upsert on the job link so repeated runs do not create duplicates
                if mongo_table.update_one({'zw_link': item['zw_link']}, {'$set': item}, upsert=True):
                    print('Scraped:', keyword, item['zwmc'])


if __name__ == '__main__':
    start = time.time()
    number_list = list(range(TOTAL_PAGE_NUMBER))
    args = product(ADDRESS, number_list)
    pool = Pool()
    pool.map(main, args)  # crawl in parallel across multiple processes
    end = time.time()
    print('Finished, task runs %s seconds.' % (end - start))
Thanks to the main contributor of the code above: lemon.
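Once a run has finished, it can help to spot-check what actually landed in MongoDB. The snippet below is a minimal sketch of my own (not part of the original spider); it assumes the same MONGO_URI, MONGO_DB, and KEYWORDS values configured above.
# Quick look at the scraped Zhilian data; assumes the configuration used above.
import pymongo

client = pymongo.MongoClient('localhost')
db = client['zhilian']

for keyword in ['java', '艺术', '设计']:
    collection = db[keyword]
    # pull a few records and print the job title and salary as a spot check
    sample = list(collection.find({}, {'zwmc': 1, 'zwyx': 1, '_id': 0}).limit(3))
    print(keyword, '- sample of', len(sample), 'records:')
    for doc in sample:
        print('   ', doc.get('zwmc'), doc.get('zwyx'))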
2. Lagou
Straight to the code (Python 2.7, using an SQLite database):
import requests
from bs4 import BeautifulSoup
import sqlite3
import datetime
import json
import time
import random
import sys
import os
this_year = datetime.datetime.now().year
# load the record of pages already crawled (it is saved back to a json file at the end of a run)
try:
    with open("%dparsed.json" % this_year) as f:
        running_status = json.load(f)
except:  # no record yet, start fresh
    running_status = []
conn = sqlite3.connect("%d.db" %this_year)
cursor = conn.cursor()
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
"Cookie":"user_trace_token=20170713202208-e4038337-67c5-11e7-ba4d-525400f775ce; LGUID=20170713202208-e4038a60-67c5-11e7-ba4d-525400f775ce; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAACEBACDG1BC39A67EE4674BEA5CC657D85B07E9F; TG-TRACK-CODE=index_navigation; X_MIDDLE_TOKEN=493e8c86da05060187202c1ab73933d2; X_HTTP_TOKEN=9e18de54125fdedc23f44e90ef2444c8; SEARCH_ID=81783bf1b1c24554b8a0c09547904c64; _gid=GA1.2.1242554573.1499919725; _gat=1; _ga=GA1.2.1892926406.1499919725; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1499921082,1500005352,1500058199,1500068071; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1500071706; LGSID=20170715133435-4958d6c9-691f-11e7-8990-525400f775ce; LGRID=20170715143510-c01adf2d-6927-11e7-a985-5254005c3644"}
# "flood" crawler: walks the whole category tree on the Lagou front page
def flood_spide_begin():
    URL = "https://www.lagou.com/"  # Lagou front page (the original post showed only the site name here)
    response = requests.get(URL)
    soup = BeautifulSoup(response.content, "html5lib")
    # main navigation menu on the left of the front page
    tag = soup.find("div", class_="mainNavs")
    # department categories
    menu_boxs = tag.find_all("div", class_="menu_box")
    positions_frame = [(i.find("div", class_="menu_main").h2.text.strip(),
                        i.find("div", class_="menu_sub")) for i in menu_boxs]  # contents of one menu box
    department_type = dict(positions_frame)
    try:
        for i in department_type:
            try:
                cursor.execute("create table %s (ptn,tech_name,link,zwmc,yx,rzyq,gsmc,fzjd,zwyh,dz)" % i.encode("utf-8"))
            except Exception as e:
                print e
                pass
            menu_sub = department_type[i]
            # the sub-menu is handled by a separate function
            list_profession(menu_sub, i.encode("utf-8"))
    finally:
        cursor.close()
        conn.commit()
        conn.close()
def list_profession(menu_sub, department_name):
    # profession categories (used as keys)
    profession_type = menu_sub.find_all("dl")
    profession_type_name = (n.find("span").string.strip() for n in profession_type)
    # technology categories under each profession
    technology = (n.find_all("a") for n in profession_type)
    for ptn, tec in zip(profession_type_name, technology):
        list_technology(ptn, tec, department_name)


def list_technology(ptn, tec, department_name):
    # technology names
    technology_name = (n.text.strip() for n in tec)
    # every paginated result page for each technology link, kept paired with its link
    all_pages = map(second_page_pages, tec)
    print ptn
    for tech_name, tech_pages in zip(technology_name, all_pages):
        for page in tech_pages:
            list_result(department_name, ptn, tech_name, page)


# write the parsed results into the database
def list_result(department_name, ptn, tech_name, tech_type):
    result = second_page(tech_type)
    for i in result:
        infos = [ptn, tech_name]
        infos.extend(i)
        cursor.execute("insert into {} values ({})".format(department_name, ",".join(["?"] * len(infos))),
                       tuple(infos))


# parse a second-level (listing) page
def second_page(req):
    soup = BeautifulSoup(req.content, "html5lib")
    # list of postings on the page
    info_list = soup.find_all("li", class_=["con_list_item", "default_list"])
    # links to the third-level (detail) pages
    link = [i.find("a", class_="position_link")["href"] for i in info_list]
    # job titles
    positions_name = [i.find("h3").text.strip() for i in info_list]
    # salary
    money = [i.find("span", class_="money").string for i in info_list]
    # entry requirements (experience / education)
    entry_requirements = [list(i.find("div", class_="li_b_l").stripped_strings)[1] for i in info_list]
    # company names
    company_name = [i.find("div", class_="company_name").text.strip() for i in info_list]
    # company industry and funding stage
    industry = [i.find("div", class_="industry").text.strip() for i in info_list]
    # perks ("job temptations")
    welfare = [i.find("div", class_="li_b_r").text.strip()[1:-1] for i in info_list]
    # addresses
    address = [i.find("em").text.strip() for i in info_list]
    result = []
    for i in zip(link, positions_name, money, entry_requirements, company_name, industry, welfare, address):
        result.append(i)
    return result


# collect every paginated listing page behind one category link
def second_page_pages(link_page):
    link = link_page["href"]
    responses = []
    s = requests.Session()
    for i in range(1, 31):
        url = link + "%d/" % i
        if url in running_status:
            continue
        try:
            response = s.get(url, headers=headers, timeout=10)
        except:
            continue
        if response.url != url:
            break
        else:
            responses.append(response)
            running_status.append(url)
    return responses


if __name__ == '__main__':
    # sys.setrecursionlimit(20000)  # raise the maximum recursion depth to 20000 if needed
    flood_spide_begin()
    # persist the list of crawled pages so an interrupted run can be resumed
    with open("%dparsed.json" % this_year, "w") as f:
        json.dump(running_status, f)
Run the two programs above and the data from both job sites is collected. With that, the task is done.
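To take a quick look at the Lagou data as well, here is a minimal sketch of my own (not part of the original script); it assumes the "<year>.db" file name and the table layout created by flood_spide_begin above.
# Quick look at the scraped Lagou data; assumes the "<year>.db" file and tables created above.
import datetime
import sqlite3

conn = sqlite3.connect("%d.db" % datetime.datetime.now().year)
cursor = conn.cursor()

# list the tables the crawler created (one per department category)
cursor.execute("select name from sqlite_master where type='table'")
tables = [row[0] for row in cursor.fetchall()]
print(tables)

# print a few rows from the first table as a sanity check
if tables:
    cursor.execute("select zwmc, yx, gsmc, dz from [%s] limit 5" % tables[0])
    for row in cursor.fetchall():
        print(row)

conn.close()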

Don't leave just yet: there is a second article, a very cool one, to share with you.
It is about one of the guests at this year's China cardistry convention, Jaspas.
He is a cardist from Singapore with excellent technique, and the two videos in that article are more than enough to show his skill. Enjoy!