Boss直聘 Job Scraper Source Code

Source code for a scraper that collects data-analysis job postings from Boss直聘.

Boss直聘 has a fairly high anti-scraping bar: it uses cookie-based anti-scraping, and a single cookie can only fetch about 4 pages of data. Once those 4 pages are done, you have to obtain a fresh cookie and change the starting page of the for loop accordingly.

The url in the code needs to be changed to the URL you actually want to scrape.

Also, when the error IndexError: list index out of range appears, you need to copy your browser's cookie into the header dictionary yourself. One cookie is good for 4 pages of data; after those 4 pages have been scraped, the IndexError: list index out of range error will come back and the cookie has to be replaced again.

If anyone has a way around this anti-scraping measure, feel free to leave a comment.
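
As a side note, one way to make the manual cookie swap less disruptive (it does not defeat the 4-page limit) is to pause for a new cookie whenever a page comes back empty, instead of letting the script crash. The sketch below is not part of the original script: the helper name fetch_titles, the input() prompt, and passing the cookie in as an argument are all assumptions made for illustration, and it only grabs the job titles to show the pattern.

import requests
from lxml import etree

def fetch_titles(page, cookie, user_agent):
    # illustrative helper (not in the original script): retry one listing page
    # until it returns job data, asking for a fresh cookie whenever it does not
    url = 'https://www.zhipin.com/c100010000-p100511/?page=' + str(page) + '&ka=page-next'
    while True:
        header = {'Cookie': cookie, 'user-agent': user_agent}
        html = requests.get(url, headers=header).text
        titles = etree.HTML(html).xpath('//*[@class="job-name"]/a/@title')
        if titles:
            return titles, cookie
        # an empty result usually means the cookie is exhausted (roughly 4 pages per cookie)
        cookie = input('Cookie expired, paste a fresh one and press Enter: ').strip()

The full listing follows.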

import csv
import requests
from bs4 import BeautifulSoup
from lxml import etree

def open_page(url):
    # fetch one listing page and return the prettified HTML; the Cookie below
    # must be replaced with one copied from your own logged-in browser session
    header = {
        'Cookie': '_bl_uid=ddk63ga1k2edh2g0h39tm2njR0hh; JSESSIONID=""; t=BGgf89C14h4pU8ah; wt=BGgf89C14h4pU8ah; lastCity=100010000; __zp_seo_uuid__=b80b9e7a-97e9-4538-a153-b0cddafa25a6; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1603343711,1603352164,1603848234; __c=1603343708; __l=l=%2Fwww.zhipin.com%2Fc100010000-p100511%2F%3Fpage%3D1%26ka%3Dpage-next&r=https%3A%2F%2Fcn.bing.com%2F&g=&friend_source=0&friend_source=0; __a=59951139.1603343708..1603343708.59.1.59.59; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1603856183; __zp_stoken__=e0a5bfExIBH42THNSczxbYGdzGgQZZ39Kc2UBIitKUV5dIX92SW8NUCFUDlxfICRXPG4SEU8XeHRuJExlKGM4GzpvNjNcLCo4JCMnGRErcCRoVC5cGFkKDkdNDj1HPlMcOj8YAlxsfX1FXFoGPg%3D%3D',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
    }
    res = requests.get(url, headers=header)
    html = res.text
    soup = BeautifulSoup(html, 'lxml')
    html = soup.prettify()

    return html

def data_clear(items):
    # strip surrounding whitespace from every extracted text fragment
    for i in range(len(items)):
        items[i] = items[i].strip()
    return items

def list_split(items):
    # split a flat list of alternating values into two parallel lists
    # (even indexes / odd indexes), dropping empty strings
    list_1 = [items[i] for i in range(0, len(items), 2) if items[i] != '']
    list_2 = [items[i] for i in range(1, len(items), 2) if items[i] != '']
    return list_1, list_2

def extract_text(html):
    # pull the job fields out of the listing page with xpath
    selector = etree.HTML(html)
    titles = selector.xpath('//*[@class="job-name"]/a/@title')
    areas = selector.xpath('//*[@class="job-area"]/text()')
    pubtimes = selector.xpath('//*[@class="job-pub-time"]/text()')
    companies = selector.xpath('//*[@class="name"]/a/text()')
    salaries = selector.xpath('//*[@class="red"]/text()')
    requirements = selector.xpath('//*[@class="job-limit clearfix"]/p/text()')
    industries = selector.xpath('//*[@class="company-text"]/p/a/text()')
    investments = selector.xpath('//*[@class="company-text"]/p/text()')
    tags = selector.xpath('//*[@class="tag-item"]/text()')
    others = selector.xpath('//*[@class="info-desc"]/text()')

    areas = data_clear(areas)
    pubtimes = data_clear(pubtimes)
    companies = data_clear(companies)
    salaries = data_clear(salaries)
    requirements = data_clear(requirements)
    industries = data_clear(industries)
    investments = data_clear(investments)
    tags = data_clear(tags)
    others = data_clear(others)

    requirements_1, requirements_2 = list_split(requirements)
    investments_1, investments_2 = list_split(investments)

    info_dict = {
        'titles': titles,
        'areas': areas,
        'pubtimes': pubtimes,
        'companies': companies,
        'salaries': salaries,
        'requirements_1': requirements_1,
        'requirements_2': requirements_2,
        'industries': industries,
        'investments_1': investments_1,
        'investments_2': investments_2,
        'tags': tags,
        'others': others
    }

    return info_dict

def save_csv(message_list):
    # append one row to boss.csv; newline='' avoids blank rows on Windows and
    # utf-8-sig keeps the Chinese text readable when the file is opened in Excel
    with open('boss.csv', 'a', newline='', encoding='utf-8-sig') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(message_list)

if __name__ == '__main__':
    # one cookie only survives about 4 pages: when it expires, paste a new cookie
    # into open_page and change the start of this range to the page you stopped at
    for i in range(1, 30):
        url = 'https://www.zhipin.com/c100010000-p100511/?page=' + str(i) + '&ka=page-next'
        html = open_page(url)
        # print(html)
        info_dict = extract_text(html)
        if i == 1:
            fieldnames = list(info_dict.keys())
            save_csv(fieldnames)
            print('save_csv complete')
        # print(info_dict)
        if not info_dict['titles']:
            print('No job data returned on page ' + str(i) + '; the cookie has probably expired, replace it and restart from this page')
            break
        for j in range(len(info_dict['titles'])):
            # one row per job posting; the xpath result lists are parallel
            row = [values[j] for values in info_dict.values()]
            save_csv(row)
        print('Fetched page ' + str(i))
