Boss直聘职位爬虫源代码
boss直聘数据分析职位爬虫源代码。
boss的反爬门槛较高,有cookies反爬,一个cookies只能爬取4页的数据,爬完4页后需要重新获取cookies,再更改一下for循环的起始页。
代码中的url需要更改为自己需要的url。
另外,当出现 IndexError: list index out of range 错误时,通常说明当前 cookies 已失效:需要将浏览器中最新的 cookies 复制到 header 字典里。一个 cookies 只能爬取4页数据,爬满4页后会再次出现该错误。
如果有朋友有反反爬方法,欢迎留言评论。
import csv
import requests
from bs4 import BeautifulSoup
from lxml import etree
def open_page(url):
    """Fetch *url* with the anti-scraping Cookie/User-Agent headers and return the page HTML.

    Boss Zhipin invalidates a cookie after roughly 4 pages; when requests start
    failing (IndexError downstream / empty lists), paste a fresh browser cookie
    into the 'Cookie' value below.
    """
    header = {
        'Cookie': '_bl_uid=ddk63ga1k2edh2g0h39tm2njR0hh; JSESSIONID=""; t=BGgf89C14h4pU8ah; wt=BGgf89C14h4pU8ah; lastCity=100010000; __zp_seo_uuid__=b80b9e7a-97e9-4538-a153-b0cddafa25a6; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1603343711,1603352164,1603848234; __c=1603343708; __l=l=%2Fwww.zhipin.com%2Fc100010000-p100511%2F%3Fpage%3D1%26ka%3Dpage-next&r=https%3A%2F%2Fcn.bing.com%2F&g=&friend_source=0&friend_source=0; __a=59951139.1603343708..1603343708.59.1.59.59; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1603856183; __zp_stoken__=e0a5bfExIBH42THNSczxbYGdzGgQZZ39Kc2UBIitKUV5dIX92SW8NUCFUDlxfICRXPG4SEU8XeHRuJExlKGM4GzpvNjNcLCo4JCMnGRErcCRoVC5cGFkKDkdNDj1HPlMcOj8YAlxsfX1FXFoGPg%3D%3D',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
    }
    # Fix: no timeout meant a stalled connection could hang the crawl forever.
    res = requests.get(url, headers=header, timeout=10)
    # Fail fast on HTTP errors instead of silently parsing an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'lxml')
    # prettify() normalizes the markup so the downstream xpath text() nodes
    # come back in a consistent (whitespace-padded) form.
    return soup.prettify()
def data_clear(items):
    """Strip leading/trailing whitespace from every string in *items*, in place.

    Returns the same list so calls can be chained/assigned.

    Fixes two issues in the original:
      * the loop started at index 1, so the FIRST element was never stripped
        (prettify() pads every text node with whitespace, element 0 included);
      * the parameter shadowed the builtin ``list``.
    """
    for i, value in enumerate(items):
        items[i] = value.strip()
    return items
def list_split(list):
    """Split *list* into its even-indexed and odd-indexed elements.

    Empty-string entries are dropped from both halves. Returns the pair
    (even_indexed, odd_indexed). Used to unzip the interleaved
    requirements/investments text nodes scraped from the page.
    """
    evens, odds = [], []
    for idx, item in enumerate(list):
        if item == '':
            continue
        target = evens if idx % 2 == 0 else odds
        target.append(item)
    return evens, odds
def extract_text(html):
    """Extract the job-listing columns from one Boss Zhipin result page.

    Returns a dict mapping column name -> list of values for that page.
    The interleaved 'requirements' and 'investments' text nodes are unzipped
    into *_1/*_2 pairs via list_split().
    """
    selector = etree.HTML(html)
    # Column -> XPath. 'titles' comes from an attribute and needs no cleanup;
    # everything else is a text() node padded by prettify() and gets stripped.
    xpaths = {
        'titles': '//*[@class="job-name"]/a/@title',
        'areas': '//*[@class="job-area"]/text()',
        'pubtimes': '//*[@class="job-pub-time"]/text()',
        'companies': '//*[@class="name"]/a/text()',
        'salaries': '//*[@class="red"]/text()',
        'requirements': '//*[@class="job-limit clearfix"]/p/text()',
        'industries': '//*[@class="company-text"]/p/a/text()',
        'investments': '//*[@class="company-text"]/p/text()',
        'tags': '//*[@class="tag-item"]/text()',
        'others': '//*[@class="info-desc"]/text()',
    }
    fields = {name: selector.xpath(path) for name, path in xpaths.items()}
    for name in ('areas', 'pubtimes', 'companies', 'salaries', 'requirements',
                 'industries', 'investments', 'tags', 'others'):
        fields[name] = data_clear(fields[name])
    requirements_1, requirements_2 = list_split(fields['requirements'])
    investments_1, investments_2 = list_split(fields['investments'])
    return {
        'titles': fields['titles'],
        'areas': fields['areas'],
        'pubtimes': fields['pubtimes'],
        'companies': fields['companies'],
        'salaries': fields['salaries'],
        'requirements_1': requirements_1,
        'requirements_2': requirements_2,
        'industries': fields['industries'],
        'investments_1': investments_1,
        'investments_2': investments_2,
        'tags': fields['tags'],
        'others': fields['others'],
    }
def save_csv(message_list):
    """Append *message_list* as one row to boss.csv.

    Fixes a real bug: the original wrote
    ``[message_list[i] for i in range(0, len(info_dict))]`` — it sized the row
    by the GLOBAL ``info_dict`` (always 12 columns), which truncates longer
    rows and raises IndexError on shorter ones. The row is written as given.
    """
    # newline='' is required by the csv module to avoid blank lines on Windows;
    # an explicit encoding keeps the Chinese text portable across platforms.
    with open('boss.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(message_list)
if __name__ == '__main__':
    # Crawl pages 1-29. A given cookie only survives ~4 pages: when it expires,
    # paste a fresh cookie into open_page()'s header and restart this range
    # from the next page.
    for page in range(1, 30):
        url = 'https://www.zhipin.com/c100010000-p100511/?page=' + str(page) + '&ka=page-next'
        html = open_page(url)
        info_dict = extract_text(html)
        if page == 1:
            # Write the header row once, using the dict keys as column names.
            save_csv([key for key in info_dict.keys()])
            print('save_csv complete')
        # Fix: the original inner loop ran range(len(info_dict)) — the number
        # of COLUMNS (always 12), not the number of job rows on the page.
        # With more than 12 listings data was dropped; with fewer (or uneven
        # xpath results) it raised "IndexError: list index out of range".
        # Iterate up to the shortest column so every written row is complete.
        row_count = min((len(column) for column in info_dict.values()), default=0)
        for row_idx in range(row_count):
            save_csv([column[row_idx] for column in info_dict.values()])
        print("已获取到第" + str(page) + "页")
最后,在研究或是对Python感兴趣的小伙伴,可以与我一起交流成长:点击链接加入群聊【python交流群】