本文已参与「新人创作礼」活动,一起开启掘金创作之路。
今天主要是翻到了之前写的一篇爬虫的数据分析案例,将抓取到的51job的招聘数据进行可视化分析,下面先上结果图
职位图
词云图
各城市职位需求占比
网页可视化,学历分析需求图
网页可视化,各城市需求职位数量图
下面直接上代码,由于没有保存数据的地方,代码中的停用词表、原始的招聘数据、词云的模板图等都没有办法保存到网站内,有需要的可以评论留言告诉我,给你发送过去。
from __future__ import print_function

import codecs
import csv
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import jieba.analyse
from wordcloud import WordCloud
from pylab import mpl
from numpy import *
from pyecharts.charts import Bar, Pie, Map
from pyecharts import options as opts

# from tkinter import _flatten
def position_view():
    """Plot a bar chart of the ten hottest job titles in 51.csv.

    Reads the ``position`` column, counts how often each title occurs and
    displays the ten most frequent ones (in ascending count order) as a
    matplotlib bar chart.
    """
    with open('51.csv', 'r') as f:
        positions = [row['position'] for row in csv.DictReader(f)]
    # Counter is O(n); the original list.count() loop was O(n^2).
    counts = Counter(positions)
    # Sort by (count, title) and keep the ten largest, matching the
    # original zip/sorted behaviour.
    top_ten = sorted(zip(counts.values(), counts.keys()))[-10:]
    x_axis = [title for _, title in top_ten]
    y_axis = [count for count, _ in top_ten]
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK labels correctly
    # Bar chart of title vs. popularity.
    plt.bar(x_axis, y_axis)
    plt.xticks(fontsize=10, rotation=70)
    plt.yticks(fontsize=10)
    plt.xlabel("职位名称", fontsize=10)
    plt.ylabel("职位热度", fontsize=10)
    plt.grid(alpha=0.5)
    plt.show()
def processFile():
    # NOTE(review): this inner function is defined but never called, so
    # processFile() currently does nothing when invoked.  The layout suggests
    # the cleaning code below was meant to be the body of processFile itself
    # -- confirm with the author before flattening.
    def positon_request_cloude():
        # Strip noise characters from data.txt and append the cleaned
        # lines to data1.txt.
        f = open("data.txt")
        f1 = open("data1.txt", "a")
        line = f.readline()
        while line:
            line = str(line)
            # Remove list brackets and decorative symbols scraped from the pages.
            List = line.strip("[").strip("]").replace(r"\xa0", "").replace(r"\xa0", "").replace("_", "").replace("◆","").replace("★", "").replace("■", "")
            f1.write(List)
            line = f.readline()
        f.close()
        f1.close()
def positon_request_cloude():
    """Append every ``position_information`` cell of 51.csv to target.txt.

    Fixes the original's per-row reopen of target.txt: both files are now
    opened once, via context managers, so they are always closed even on
    error.  Each cell is also echoed to stdout, as before.
    """
    with open('51.csv', 'r') as src, open('target.txt', 'a') as dst:
        for row in csv.DictReader(src):
            print(row['position_information'])
            dst.write(str(row['position_information']))
# Binarised image that determines the shape of the generated word cloud.
bg_image_path = "glob.png"
# Cleaned text corpus the cloud is built from (output of processFile).
text_path = 'data1.txt'
# Font file used for rendering (needed for CJK glyphs).
font_path = 'simfang.ttf'
# Path to the stop-word list, one word per line.
stopwords_path = 'stopword.txt'
def clean_using_stopword(text):
    """Segment *text* with jieba and drop stop words.

    Uses the common stop-word table at ``stopwords_path`` plus whatever the
    user added to it.  Returns the surviving words joined into one string.
    """
    # Precise-mode segmentation.
    seg_list = jieba.cut(text, cut_all=False)
    # BUG FIX: the original read the stop-word file in binary mode and then
    # called str() on the bytes, producing the "b'...'" repr in which newlines
    # are the two characters '\' 'n'; split('\n') therefore never split and no
    # stop word ever matched.  Read it as text instead.
    # (assumes the stop-word file is UTF-8 -- TODO confirm)
    with open(stopwords_path, encoding='utf-8') as f_stop:
        stopwords = set(f_stop.read().splitlines())
    # Keep words that are not stop words and are longer than one character.
    return ''.join(word for word in seg_list
                   if word.strip() not in stopwords and len(word.strip()) > 1)
def preprocessing():
    """Load the corpus at ``text_path`` and return it cleaned.

    Reads the file as UTF-8 text (the original passed raw bytes to jieba)
    and delegates segmentation plus stop-word removal to
    clean_using_stopword().
    """
    with open(text_path, encoding='UTF-8') as f:
        content = f.read()
    return clean_using_stopword(content)
def extract_keywords():
    """Extract the top-weighted keywords of the corpus.

    Returns a dict mapping keyword -> TF-IDF weight, suitable for
    WordCloud.generate_from_frequencies().
    """
    # Top 100 keywords with weights (the original comment said 1000, but the
    # code has always requested 100).
    tags = jieba.analyse.extract_tags(preprocessing(), 100, withWeight=True)
    keywords = {word: weight for word, weight in tags}
    # NOTE(review): this drops keywords whose *text* contains the substrings
    # 'ns'/'nr'.  It looks like the intent was to filter by POS tag
    # (place/person names), which this does not do -- confirm with the author.
    for key in list(keywords):
        if 'ns' in key or 'nr' in key:
            del keywords[key]
    return keywords
def draw_wordcloud():
    """Generate the keyword word cloud, display it and save wordcloud.jpg.

    The cloud is shaped by the binarised image at ``bg_image_path`` and
    weighted by the frequencies returned from extract_keywords().
    """
    back_coloring = np.array(Image.open(bg_image_path))
    # A CJK-capable font is mandatory, otherwise Chinese renders as boxes.
    # Consistency fix: use the module-level font_path constant instead of
    # repeating the literal 'simfang.ttf'.
    wc = WordCloud(font_path=font_path,
                   background_color="white",  # canvas colour
                   max_words=2000,            # cap on displayed words
                   mask=back_coloring,        # shape mask
                   random_state=30,           # number of colour schemes
                   width=1000,                # canvas width
                   height=500,                # canvas height
                   )
    # Build the cloud from keyword frequencies.
    wc.generate_from_frequencies(extract_keywords())
    # Show it in a matplotlib window, then persist to disk.
    plt.figure()
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
    wc.to_file("wordcloud.jpg")
def processCityData():
    """Draw a pie chart of job demand for the 15 most-seen cities in 51.csv.

    Also normalises the ``salary`` column to a "<low>k-<high>k" monthly
    format via the nested helper; that result is currently unused by the
    chart itself (kept for parity with the original code).
    """
    with open('51.csv', 'r') as f:
        regions = [row['region'] for row in csv.DictReader(f)]
    # "city-district" -> keep only the city part.
    citys = [str(region).split("-")[0] for region in regions]
    # Counter keeps first-occurrence order, matching the original dict build,
    # and is O(n) instead of the O(n^2) list.count() loop.
    city_dict = Counter(citys)

    def processSalary():
        """Normalise every salary cell to "<low>k-<high>k" per month."""
        with open('51.csv', 'r') as f:
            raw = [row['salary'] for row in csv.DictReader(f)]
        salary = []
        for pay in raw:
            pay = str(pay)
            if "千/月" in pay:  # thousands per month
                low, high = re.findall(r"(.*?)千/月", pay)[0].split("-")[:2]
                salary.append(low + "k" + "-" + high + "k")
            elif "万/月" in pay:  # ten-thousands per month
                low, high = re.findall(r"(.*?)万/月", pay)[0].split("-")[:2]
                salary.append(str(float(low) * 10) + "k" + "-" +
                              str(float(high) * 10) + "k")
            elif "万/年" in pay:  # ten-thousands per year -> monthly
                low, high = re.findall(r"(.*?)万/年", pay)[0].split("-")[:2]
                salary.append(str(round(float(low) / 12, 1)) + "k" + "-" +
                              str(round(float(high) / 12, 1)) + "k")
            else:
                # No recognised unit: fall back to a typical bracket.
                salary.append("7k-8k")
        return salary

    # NOTE(review): computed but never used by the pie chart below; kept for
    # parity with the original -- confirm whether it can be dropped.
    salary = processSalary()
    dic = dict(zip(citys, salary))

    labels = list(city_dict)[:15]
    values = [city_dict[c] for c in labels]
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK labels
    # Explode slices after the fifth one progressively so labels overlap less.
    distance = []
    offset = 1
    for index, _ in enumerate(labels, start=1):
        if index > 5:
            offset += 0.1
            distance.append(offset)
        else:
            distance.append(0)
    plt.pie(values, labels=labels, labeldistance=1.2, autopct='%2.1f%%',
            pctdistance=0.6, shadow=True, explode=distance)
    plt.axis('equal')  # force a circular pie
    plt.legend(loc='upper left', bbox_to_anchor=(-0.1, 1))
    plt.savefig('IT职位需求分布.jpg')
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
def base_map(x, y) -> Map:
    """Build a pyecharts china map of job demand per city.

    x: iterable of city names; y: matching demand counts.
    Returns the configured Map instance, ready to render.
    """
    demand_pairs = list(zip(x, y))
    chart = Map(init_opts=opts.InitOpts(width='1200px', height='600px'))
    chart.add(series_name='城市职位需求量', data_pair=demand_pairs,
              maptype="china", is_selected=True)
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="ECI for cities in China"),
        visualmap_opts=opts.VisualMapOpts(is_show=True, min_=min(y),
                                          max_=max(y), is_piecewise=True),
    )
    return chart
def drawMap():
    """Render an HTML china map of job demand per city.

    Reads the ``region`` column of 51.csv, counts jobs per city and writes
    the chart to ./positionResuqst.html via base_map().
    """
    with open('51.csv', 'r') as f:
        regions = [row['region'] for row in csv.DictReader(f)]
    # "city-district" -> keep only the city part.
    citys = [str(region).split("-")[0] for region in regions]
    # O(n) counting; preserves first-occurrence order like the original dict.
    city_dict = Counter(citys)
    indexs = list(city_dict)
    values = [city_dict[city] for city in indexs]
    # Renamed from `map`, which shadowed the builtin.
    demand_map = base_map(indexs, values)
    demand_map.render("./positionResuqst.html")
def paplenumberAnalyse():
    """Render a pie chart of job demand per education level.

    Reads the ``education`` column of 51.csv, drops cells that are really
    head-count text (they contain "招", e.g. "招5人"), and writes the chart
    to pie_set_color.html.
    """
    with open('51.csv', 'r') as f:
        education_list = [row['education'] for row in csv.DictReader(f)]
    # O(n) counting instead of list.count() in a loop.
    education_dict = Counter(education_list)
    # Remove head-count cells that leaked into the education column.
    for item in list(education_dict):
        if "招" in str(item):
            del education_dict[item]
    print(dict(education_dict))
    labels = list(education_dict)
    counts = [education_dict[key] for key in labels]
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK labels
    (
        Pie()
        .add("", [list(pair) for pair in zip(labels, counts)])
        .set_colors(["blue", "green", "yellow", "red", "pink", "orange", "purple"])
        .set_global_opts(title_opts=opts.TitleOpts(title="职位可视化分析"))
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
        .render("pie_set_color.html")
    )
if __name__ == "__main__":
    # Guarded so importing this module no longer runs every chart.
    # Bar chart: top-ten job titles by popularity.
    position_view()
    # Word cloud of job requirements.
    draw_wordcloud()
    # Pie chart: IT demand across the 15 hottest cities.
    processCityData()
    # China map of demand per city.
    drawMap()
    # Demand share per education level.
    paplenumberAnalyse()