51job职位需求可视化分析

291 阅读4分钟

本文已参与「新人创作礼」活动,一起开启掘金创作之路。

今天主要是翻到了之前写的一篇爬虫的数据分析案例,将抓取到的51job的招聘数据进行可视化分析,下面先上结果图

职位图

1.PNG

词云图

2.PNG

各城市职位需求占比

3.PNG

网页可视化,学历分析需求图

4.PNG

网页可视化,各城市需求职位数量图

6.PNG

下面直接上代码,由于没有保存数据的地方,代码中的停顿词,原始的招聘数据、词云的模板图等都没有办法保存到网站内,有需要的可以评论留言告诉我,给你发送过去。

from __future__ import print_function

# Standard library
import codecs
import csv
import re
from collections import Counter
# from tkinter import _flatten

# Third-party
import jieba.analyse
import matplotlib.pyplot as plt
import numpy as np
from numpy import *
from PIL import Image
from pyecharts import options as opts
from pyecharts.charts import Bar, Pie, Map
from pylab import mpl
from wordcloud import WordCloud
def position_view():
    """Plot a bar chart of the 10 most frequent job titles in 51.csv.

    Reads the 'position' column, counts how often each title occurs, and
    shows the top 10 with matplotlib, ordered least to most frequent from
    left to right (same ordering as the original sorted(zip(...))[-10:]).

    Fixes: Counter replaces the O(n^2) list.count() loop, the redundant
    f.close() after the `with` block is gone, and the unused X/Y locals
    were removed.
    """
    with open('51.csv', 'r') as f:
        reader = csv.DictReader(f)
        counts = Counter(row['position'] for row in reader)

    # Sort by (count, title) so ties break the same way as the original
    # sorted(zip(count, title)); keep the last (largest) 10.
    top10 = sorted(counts.items(), key=lambda kv: (kv[1], kv[0]))[-10:]
    x_axis = [title for title, _ in top10]
    y_axis = [cnt for _, cnt in top10]

    plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly

    plt.bar(x_axis, y_axis)
    plt.xticks(fontsize=10, rotation=70)
    plt.yticks(fontsize=10)
    plt.xlabel("职位名称", fontsize=10)
    plt.ylabel("职位热度", fontsize=10)
    plt.grid(alpha=0.5)
    plt.show()
def processFile():
    """Clean the raw job-description dump for word-cloud input.

    Reads data.txt line by line, strips a leading '[' / trailing ']' and
    removes decorative characters (the literal 4-char sequence ``\\xa0``,
    '_', '◆', '★', '■'), appending each cleaned line to data1.txt.

    Bug fix: the original wrapped this logic in a nested function that was
    defined but never called, so processFile() did nothing at all.  The
    duplicated ``.replace(r"\\xa0", "")`` call was also removed, and the
    files are now managed with `with` so they close even on error.
    """
    with open("data.txt") as src, open("data1.txt", "a") as dst:
        for line in src:
            cleaned = (line.strip("[").strip("]")
                       .replace(r"\xa0", "")
                       .replace("_", "")
                       .replace("◆", "")
                       .replace("★", "")
                       .replace("■", ""))
            dst.write(cleaned)
def positon_request_cloude():
    """Dump the 'position_information' column of 51.csv to target.txt.

    Appends each row's job-description text (echoed to stdout as before)
    to target.txt.  NOTE(review): the misspelled name ('positon',
    'cloude') is kept so existing callers keep working.

    Fixes: target.txt is opened once instead of being reopened and closed
    for every CSV row, and the unused accumulator variables were removed.
    """
    with open('51.csv', 'r') as src, open('target.txt', 'a') as out:
        for row in csv.DictReader(src):
            print(row['position_information'])
            out.write(str(row['position_information']))

# Binarised image that decides the shape of the generated word cloud
bg_image_path = "glob.png"
# Text data (the cleaned job descriptions)
text_path = 'data1.txt'
# Font file — a CJK font is required to render Chinese glyphs
font_path = 'simfang.ttf'
# Stop-word list path
stopwords_path = 'stopword.txt'

def clean_using_stopword(text):
    """Segment *text* with jieba and drop stop words.

    Uses precise-mode segmentation plus the stop-word list at
    ``stopwords_path`` (common stop words + custom additions); tokens
    that strip to a single character are always dropped.  Returns the
    surviving tokens joined into one string.

    Bug fix: the stop-word file was read in binary mode and converted
    with str(), which produced the literal "b'...'" repr; splitting that
    on '\\n' yielded one giant element, so no stop word was ever removed.
    The file is now read as text.
    # assumes stopword.txt is UTF-8 encoded — TODO confirm
    """
    kept = []
    # Precise (non-full) segmentation mode, as in the original.
    segments = jieba.cut(text, cut_all=False)
    with open(stopwords_path, 'r', encoding='utf-8') as f_stop:
        stopwords = {line.strip() for line in f_stop}
    for word in segments:
        token = word.strip()
        if token not in stopwords and len(token) > 1:
            kept.append(word)
    return ''.join(kept)


def preprocessing():
    """Read the cleaned job-description text and strip stop words.

    Returns the stop-word-filtered text produced by clean_using_stopword().

    Fixes: the file is decoded as UTF-8 text up front instead of handing
    raw bytes to jieba, and the debug print of the entire file contents
    was removed.
    # assumes data1.txt is UTF-8 encoded — TODO confirm
    """
    with open(text_path, 'r', encoding='utf-8') as f:
        content = f.read()
    return clean_using_stopword(content)


def extract_keywords():
    """Extract the top-100 TF-IDF keywords for the word cloud.

    Returns a {keyword: weight} dict from jieba over the preprocessed
    text.  Keywords containing the literal substrings 'ns' or 'nr' are
    then dropped.

    NOTE(review): the original apparently intended to filter by the
    part-of-speech tags 'ns'/'nr' (place/person names, see the unused
    ``allow_pos`` variable), but it actually tested those substrings
    inside the keyword text.  The observable behavior is preserved; the
    unused variable, the stale "1000 keywords" comment, and the
    redundant elif branch were removed.
    """
    tags = jieba.analyse.extract_tags(preprocessing(), 100, withWeight=True)
    keywords = {word: weight for word, weight in tags}
    for key in list(keywords):
        if 'ns' in key or 'nr' in key:
            del keywords[key]
    return keywords


def draw_wordcloud():
    """Render and save the keyword word cloud.

    1. Configure a WordCloud shaped by the mask image at bg_image_path.
    2. Fill it from the weighted keywords of extract_keywords().
    3. Display with matplotlib and save to wordcloud.jpg.
    """
    mask = np.array(Image.open(bg_image_path))
    wc = WordCloud(
        # Consistency fix: use the module-level font_path constant
        # instead of repeating the 'simfang.ttf' literal.  A CJK font is
        # required — otherwise Chinese words render as boxes.
        font_path=font_path,
        background_color="white",
        max_words=2000,      # cap on words drawn
        mask=mask,           # mask image defines the cloud's outline
        random_state=30,     # number of colour schemes / reproducibility
        width=1000,          # output canvas width
        height=500,          # output canvas height
    )

    # Layout words by their extracted weights.
    wc.generate_from_frequencies(extract_keywords())
    plt.figure()
    plt.imshow(wc)
    plt.axis("off")
    # Open the matplotlib viewer with the rendered cloud.
    plt.show()
    # Also persist it to disk.
    wc.to_file("wordcloud.jpg")


def processCityData():
    """Pie chart of job-demand share for the 15 busiest cities.

    Reads the 'region' column of 51.csv (format "city-district"), keeps
    the city part, and plots the 15 cities with the most postings as a
    pie chart, saving it to 'IT职位需求分布.jpg'.

    Changes vs. the original:
    - the nested salary-parsing helper and the city->salary dict it fed
      were dead code (never used by the chart) and have been removed;
    - city counting uses Counter instead of O(n^2) list.count();
    - the 15 cities are now selected by posting count, matching the
      caller's "top-15 by heat" intent (the original took the first 15
      distinct cities in file order);
    - unused locals (X/Y-style leftovers) were dropped.
    """
    with open('51.csv', 'r') as f:
        reader = csv.DictReader(f)
        # "city-district" -> "city"
        city_counts = Counter(str(row['region']).split("-")[0] for row in reader)

    top15 = city_counts.most_common(15)
    labels = [city for city, _ in top15]
    sizes = [cnt for _, cnt in top15]

    plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels

    # Explode every slice after the 5 largest, each a bit further out,
    # so small slices stay readable (same progression as the original:
    # 0 for the first five, then 1.1, 1.2, ...).
    explode = []
    offset = 1
    for idx in range(len(labels)):
        if idx >= 5:
            offset += 0.1
            explode.append(offset)
        else:
            explode.append(0)

    plt.pie(sizes, labels=labels, labeldistance=1.2, autopct='%2.1f%%',
            pctdistance=0.6, shadow=True, explode=explode)
    plt.axis('equal')  # keep the pie circular
    plt.legend(loc='upper left', bbox_to_anchor=(-0.1, 1))
    plt.savefig('IT职位需求分布.jpg')
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()


def base_map(x, y) -> Map:
    """Build a China map chart of per-city job demand.

    x: sequence of city names; y: matching posting counts.
    Returns the configured pyecharts ``Map`` object (caller renders it).
    """
    chart = Map(init_opts=opts.InitOpts(width='1200px', height='600px'))
    chart.add(
        series_name='城市职位需求量',
        data_pair=list(zip(x, y)),
        maptype="china",
        is_selected=True,
    )
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="ECI for cities in China"),
        visualmap_opts=opts.VisualMapOpts(
            is_show=True,
            min_=min(y),
            max_=max(y),
            is_piecewise=True,
        ),
    )
    return chart


def drawMap():
    """Render the per-city job-demand map to positionResuqst.html.

    Counts postings per city from the 'region' column of 51.csv
    (format "city-district") and feeds the (city, count) pairs to
    base_map().

    Fixes: Counter replaces the O(n^2) list.count() loop, the unused
    salary variables and the old commented-out pyecharts v0 calls were
    removed, and the local no longer shadows the builtin ``map``.
    """
    with open('51.csv', 'r') as f:
        reader = csv.DictReader(f)
        city_counts = Counter(str(row['region']).split("-")[0] for row in reader)

    cities = list(city_counts.keys())
    counts = [city_counts[c] for c in cities]

    chart = base_map(cities, counts)
    # NOTE(review): the output filename keeps the original's typo
    # ("Resuqst") so existing links/bookmarks still resolve.
    chart.render("./positionResuqst.html")


def paplenumberAnalyse():
    """Pie chart (HTML) of posting counts per required education level.

    Counts the 'education' column of 51.csv, drops entries containing
    '招' (headcount strings such as "招5人" that leak into the column),
    prints the resulting counts, and renders a pyecharts pie chart to
    pie_set_color.html.  NOTE(review): the misspelled function name is
    kept for backward compatibility.

    Fixes: Counter replaces O(n^2) list.count(); the matplotlib
    rcParams line was dropped (this chart is pyecharts HTML, so the
    matplotlib font setting had no effect here); the unused binding of
    the render() result was removed.
    """
    with open('51.csv', 'r') as f:
        reader = csv.DictReader(f)
        education_counts = Counter(row['education'] for row in reader)

    # Headcount values are not education levels — drop them.
    for key in list(education_counts):
        if "招" in str(key):
            del education_counts[key]

    print(dict(education_counts))

    labels = list(education_counts.keys())
    values = [education_counts[k] for k in labels]
    (
        Pie()
        .add("", [list(pair) for pair in zip(labels, values)])
        .set_colors(["blue", "green", "yellow", "red", "pink", "orange", "purple"])
        .set_global_opts(title_opts=opts.TitleOpts(title="职位可视化分析"))
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
        .render("pie_set_color.html")
    )


# Run the full analysis pipeline.  The __main__ guard is a fix: the
# original executed everything at import time as a side effect.
if __name__ == "__main__":
    # Job-title popularity bar chart (top 10 only).
    position_view()
    # Word cloud of job requirements.
    draw_wordcloud()
    # City distribution of IT demand (top 15).
    processCityData()
    # Per-city demand map (HTML output).
    drawMap()
    # Share of postings per required education level.
    paplenumberAnalyse()