Preface
I spent the past while doing web development and recently felt like returning to my old trade: web scraping and data analysis. Job postings came to mind (perhaps because that is what has been worrying me lately). Applying for jobs is a big challenge for new graduates, so I hope to provide some clear, well-organized information that helps everyone land a good job.
This article covers the data collection; the data analysis will follow in a later post and be shared with you. There is something nice at the end of the article.
I. Determining the Target
There are plenty of job-listing sites online, such as Lagou, 58.com, and Zhilian Zhaopin. This crawl pulls its data from two of them, Lagou and Zhilian, because both categorize positions clearly and carry a large number of listings.
II. Preparation (Runtime Environment and Libraries)
- Python 3.5 (the Lagou script in part 2 uses Python 2.7)
- pymongo (another database can be substituted)
- multiprocessing
- requests
- BeautifulSoup (plus the lxml and html5lib parsers used by the two scripts)
- itertools
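Before starting, it can be worth confirming that the environment is complete. Below is a minimal sketch of my own (not part of the scrapers) that simply imports the libraries listed above and prints a few versions; an ImportError means something still needs to be installed.
# Environment sanity check: verifies that the libraries listed above (plus the two
# HTML parsers the scripts rely on) can be imported, and prints a few versions.
import sys
import itertools
import multiprocessing

import requests
import bs4
import pymongo
import lxml        # parser used by the Zhilian spider ('lxml')
import html5lib    # parser used by the Lagou spider ('html5lib')

print('Python:', sys.version.split()[0])
print('requests:', requests.__version__)
print('beautifulsoup4:', bs4.__version__)
print('pymongo:', pymongo.version)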
III. Hands-on
The content below is split into two parts: the code for scraping Zhilian Zhaopin and the code for scraping Lagou.
1. Zhilian Zhaopin
First, the scraping steps:
- Build the URLs to crawl from the keyword, the city, and the page number
- Download the content of each page
- Parse the page content and extract the fields we need
- Save the extracted records to a MongoDB database (SQLite would also work)
- Launch multiple processes with multiprocessing, which greatly speeds up the crawl
Main spider file, "zhilian_spider.py":
# _*_ coding: utf-8 _*_
# Author: "DHC_King"
# Python 3.x

### Configuration (zhilian_kw_config.py) ###
TOTAL_PAGE_NUMBER = 90  # total number of result pages to crawl per city; adjust as needed
KEYWORDS = ['java', '艺术', '设计']  # search keywords; add or change them as you like
# crawl listings for the major cities below
ADDRESS = ['全国', '北京', '上海', '广州', '深圳',
           '天津', '武汉', '西安', '成都', '大连',
           '长春', '沈阳', '南京', '济南', '青岛',
           '杭州', '苏州', '无锡', '宁波', '重庆',
           '郑州', '长沙', '福州', '厦门', '哈尔滨',
           '石家庄', '合肥', '惠州', '太原', '昆明',
           '烟台', '佛山', '南昌', '贵阳', '南宁']
MONGO_URI = 'localhost'
MONGO_DB = 'zhilian'
### End of configuration ###

### Main program ###
from datetime import datetime
from urllib.parse import urlencode
from multiprocessing import Pool
import requests
from bs4 import BeautifulSoup
import pymongo
from zhilian_kw_config import *
import time
from itertools import product

client = pymongo.MongoClient(MONGO_URI)
db = client[MONGO_DB]


def download(url):
    """Fetch a page and return its HTML text, using a desktop User-Agent."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'}
    response = requests.get(url, headers=headers)
    return response.text


def get_content(html):
    # record the date on which the record is saved
    date = datetime.now().date()
    date = datetime.strftime(date, '%Y-%m-%d')  # convert to str
    soup = BeautifulSoup(html, 'lxml')
    body = soup.body
    data_main = body.find('div', {'class': 'newlist_list_content'})
    if data_main:
        tables = data_main.find_all('table')
        for i, table_info in enumerate(tables):
            if i == 0:  # the first table is the header row, skip it
                continue
            tds = table_info.find('tr').find_all('td')
            zwmc = tds[0].find('a').get_text()      # job title
            zw_link = tds[0].find('a').get('href')  # link to the job page
            fkl = tds[1].find('span').get_text()    # feedback rate
            gsmc = tds[2].find('a').get_text()      # company name
            zwyx = tds[3].get_text()                # monthly salary
            gzdd = tds[4].get_text()                # work location
            gbsj = tds[5].find('span').get_text()   # publication date
            tr_brief = table_info.find('tr', {'class': 'newlist_tr_detail'})
            # short description of the posting
            brief = tr_brief.find('li', {'class': 'newlist_deatil_last'}).get_text()
            # yield the record lazily as a generator item
            yield {'zwmc': zwmc,        # job title
                   'fkl': fkl,          # feedback rate
                   'gsmc': gsmc,        # company name
                   'zwyx': zwyx,        # monthly salary
                   'gzdd': gzdd,        # work location
                   'gbsj': gbsj,        # publication date
                   'brief': brief,      # short description
                   'zw_link': zw_link,  # link to the job page
                   'save_date': date    # date the record was saved
                   }


def main(args):
    basic_url = '招聘(求职)尽在智联招聘?'  # placeholder left by the original post: replace with the Zhilian job-search URL
    for keyword in KEYWORDS:
        mongo_table = db[keyword]
        paras = {'jl': args[0],  # city
                 'kw': keyword,  # search keyword
                 'p': args[1]    # page number
                 }
        url = basic_url + urlencode(paras)
        # print(url)
        html = download(url)
        # print(html)
        if html:
            data = get_content(html)
            for item in data:
                # upsert on the job link so repeated runs do not create duplicates
                if mongo_table.update_one({'zw_link': item['zw_link']}, {'$set': item}, upsert=True):
                    print('Scraped:', keyword, item['zwmc'])


if __name__ == '__main__':
    start = time.time()
    number_list = list(range(TOTAL_PAGE_NUMBER))
    args = product(ADDRESS, number_list)
    pool = Pool()
    pool.map(main, args)  # crawl in parallel across multiple processes
    end = time.time()
    print('Finished, task runs %s seconds.' % (end - start))
Thanks to the main contributor of the code above: lemon.
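Once a run has finished, it can help to spot-check what actually landed in MongoDB. The snippet below is a minimal sketch of my own (not part of the original spider); it assumes the same MONGO_URI, MONGO_DB, and KEYWORDS values configured above.
# Quick look at the scraped Zhilian data; assumes the configuration used above.
import pymongo

client = pymongo.MongoClient('localhost')
db = client['zhilian']

for keyword in ['java', '艺术', '设计']:
    collection = db[keyword]
    # pull a few records and print the job title and salary as a spot check
    sample = list(collection.find({}, {'zwmc': 1, 'zwyx': 1, '_id': 0}).limit(3))
    print(keyword, '- sample of', len(sample), 'records:')
    for doc in sample:
        print('   ', doc.get('zwmc'), doc.get('zwyx'))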
2. Lagou
Straight to the code (Python 2.7, using an SQLite database):
import requests
from bs4 import BeautifulSoup
import sqlite3
import datetime
import json
import time
import random
import sys
import os
this_year = datetime.datetime.now().year
# load the record of pages already crawled (it is saved back to a json file at the end of a run)
try:
    with open("%dparsed.json" % this_year) as f:
        running_status = json.load(f)
except:  # no record yet, start fresh
    running_status = []
conn = sqlite3.connect("%d.db" %this_year)
cursor = conn.cursor()
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
"Cookie":"user_trace_token=20170713202208-e4038337-67c5-11e7-ba4d-525400f775ce; LGUID=20170713202208-e4038a60-67c5-11e7-ba4d-525400f775ce; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=ABAAABAACEBACDG1BC39A67EE4674BEA5CC657D85B07E9F; TG-TRACK-CODE=index_navigation; X_MIDDLE_TOKEN=493e8c86da05060187202c1ab73933d2; X_HTTP_TOKEN=9e18de54125fdedc23f44e90ef2444c8; SEARCH_ID=81783bf1b1c24554b8a0c09547904c64; _gid=GA1.2.1242554573.1499919725; _gat=1; _ga=GA1.2.1892926406.1499919725; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1499921082,1500005352,1500058199,1500068071; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1500071706; LGSID=20170715133435-4958d6c9-691f-11e7-8990-525400f775ce; LGRID=20170715143510-c01adf2d-6927-11e7-a985-5254005c3644"}
# "flood" crawler: walks the whole category tree on the Lagou front page
def flood_spide_begin():
    URL = "https://www.lagou.com/"  # Lagou front page (the original post showed only the site name here)
    response = requests.get(URL)
    soup = BeautifulSoup(response.content, "html5lib")
    # main navigation menu on the left of the front page
    tag = soup.find("div", class_="mainNavs")
    # department categories
    menu_boxs = tag.find_all("div", class_="menu_box")
    positions_frame = [(i.find("div", class_="menu_main").h2.text.strip(),
                        i.find("div", class_="menu_sub")) for i in menu_boxs]  # contents of one menu box
    department_type = dict(positions_frame)
    try:
        for i in department_type:
            try:
                cursor.execute("create table %s (ptn,tech_name,link,zwmc,yx,rzyq,gsmc,fzjd,zwyh,dz)" % i.encode("utf-8"))
            except Exception as e:
                print e
                pass
            menu_sub = department_type[i]
            # the sub-menu is handled by a separate function
            list_profession(menu_sub, i.encode("utf-8"))
    finally:
        cursor.close()
        conn.commit()
        conn.close()
def list_profession(menu_sub, department_name):
    # profession categories (used as keys)
    profession_type = menu_sub.find_all("dl")
    profession_type_name = (n.find("span").string.strip() for n in profession_type)
    # technology categories under each profession
    technology = (n.find_all("a") for n in profession_type)
    for ptn, tec in zip(profession_type_name, technology):
        list_technology(ptn, tec, department_name)


def list_technology(ptn, tec, department_name):
    # technology names
    technology_name = (n.text.strip() for n in tec)
    # every paginated result page for each technology link, kept paired with its link
    all_pages = map(second_page_pages, tec)
    print ptn
    for tech_name, tech_pages in zip(technology_name, all_pages):
        for page in tech_pages:
            list_result(department_name, ptn, tech_name, page)


# write the parsed results into the database
def list_result(department_name, ptn, tech_name, tech_type):
    result = second_page(tech_type)
    for i in result:
        infos = [ptn, tech_name]
        infos.extend(i)
        cursor.execute("insert into {} values ({})".format(department_name, ",".join(["?"] * len(infos))),
                       tuple(infos))


# parse a second-level (listing) page
def second_page(req):
    soup = BeautifulSoup(req.content, "html5lib")
    # list of postings on the page
    info_list = soup.find_all("li", class_=["con_list_item", "default_list"])
    # links to the third-level (detail) pages
    link = [i.find("a", class_="position_link")["href"] for i in info_list]
    # job titles
    positions_name = [i.find("h3").text.strip() for i in info_list]
    # salary
    money = [i.find("span", class_="money").string for i in info_list]
    # entry requirements (experience / education)
    entry_requirements = [list(i.find("div", class_="li_b_l").stripped_strings)[1] for i in info_list]
    # company names
    company_name = [i.find("div", class_="company_name").text.strip() for i in info_list]
    # company industry and funding stage
    industry = [i.find("div", class_="industry").text.strip() for i in info_list]
    # perks ("job temptations")
    welfare = [i.find("div", class_="li_b_r").text.strip()[1:-1] for i in info_list]
    # addresses
    address = [i.find("em").text.strip() for i in info_list]
    result = []
    for i in zip(link, positions_name, money, entry_requirements, company_name, industry, welfare, address):
        result.append(i)
    return result


# collect every paginated listing page behind one category link
def second_page_pages(link_page):
    link = link_page["href"]
    responses = []
    s = requests.Session()
    for i in range(1, 31):
        url = link + "%d/" % i
        if url in running_status:
            continue
        try:
            response = s.get(url, headers=headers, timeout=10)
        except:
            continue
        if response.url != url:
            break
        else:
            responses.append(response)
            running_status.append(url)
    return responses


if __name__ == '__main__':
    # sys.setrecursionlimit(20000)  # raise the maximum recursion depth to 20000 if needed
    flood_spide_begin()
    # persist the list of crawled pages so an interrupted run can be resumed
    with open("%dparsed.json" % this_year, "w") as f:
        json.dump(running_status, f)
Run the two programs above and the data from both job sites is collected. With that, the task is done.
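To take a quick look at the Lagou data as well, here is a minimal sketch of my own (not part of the original script); it assumes the "<year>.db" file name and the table layout created by flood_spide_begin above.
# Quick look at the scraped Lagou data; assumes the "<year>.db" file and tables created above.
import datetime
import sqlite3

conn = sqlite3.connect("%d.db" % datetime.datetime.now().year)
cursor = conn.cursor()

# list the tables the crawler created (one per department category)
cursor.execute("select name from sqlite_master where type='table'")
tables = [row[0] for row in cursor.fetchall()]
print(tables)

# print a few rows from the first table as a sanity check
if tables:
    cursor.execute("select zwmc, yx, gsmc, dz from [%s] limit 5" % tables[0])
    for row in cursor.fetchall():
        print(row)

conn.close()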

Don't leave just yet: there is a second article, a very cool one, to share with you.
It is about one of the guests at this year's China cardistry convention, Jaspas.
He is a cardist from Singapore with excellent technique, and the two videos in that article are more than enough to show his skill. Enjoy!