本文已参与「新人创作礼」活动,一起开启掘金创作之路。
今天主要是翻到了之前写的一篇爬虫的数据分析案例,将抓取到的51job的招聘数据进行可视化分析,下面先上结果图
职位图
词云图
各城市职位需求占比
网页可视化,学历分析需求图
网页可视化,各城市需求职位数量图
下面直接上代码,由于没有保存数据的地方,代码中的停用词表、原始的招聘数据、词云的模板图等都没有办法保存到网站内,有需要的可以评论留言告诉我,给你发送过去。
from __future__ import print_function

import codecs
import csv
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import jieba.analyse
from wordcloud import WordCloud
from pylab import mpl
from numpy import *
from pyecharts.charts import Bar, Pie, Map
from pyecharts import options as opts

# from tkinter import _flatten
def position_view():
    """Plot a bar chart of the ten hottest job titles in 51.csv.

    Reads the ``position`` column, counts how often each title occurs and
    displays the ten most frequent ones (in ascending count order) as a
    matplotlib bar chart.
    """
    with open('51.csv', 'r') as f:
        positions = [row['position'] for row in csv.DictReader(f)]
    # Counter is O(n); the original list.count() loop was O(n^2).
    counts = Counter(positions)
    # Sort by (count, title) and keep the ten largest, matching the
    # original zip/sorted behaviour.
    top_ten = sorted(zip(counts.values(), counts.keys()))[-10:]
    x_axis = [title for _, title in top_ten]
    y_axis = [count for count, _ in top_ten]
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK labels correctly
    # Bar chart of title vs. popularity.
    plt.bar(x_axis, y_axis)
    plt.xticks(fontsize=10, rotation=70)
    plt.yticks(fontsize=10)
    plt.xlabel("职位名称", fontsize=10)
    plt.ylabel("职位热度", fontsize=10)
    plt.grid(alpha=0.5)
    plt.show()
def processFile():
    # NOTE(review): this inner function is defined but never called, so
    # processFile() currently does nothing when invoked.  The layout suggests
    # the cleaning code below was meant to be the body of processFile itself
    # -- confirm with the author before flattening.
    def positon_request_cloude():
        # Strip noise characters from data.txt and append the cleaned
        # lines to data1.txt.
        f = open("data.txt")
        f1 = open("data1.txt", "a")
        line = f.readline()
        while line:
            line = str(line)
            # Remove list brackets and decorative symbols scraped from the pages.
            List = line.strip("[").strip("]").replace(r"\xa0", "").replace(r"\xa0", "").replace("_", "").replace("◆","").replace("★", "").replace("■", "")
            f1.write(List)
            line = f.readline()
        f.close()
        f1.close()
def positon_request_cloude():
    """Append every ``position_information`` cell of 51.csv to target.txt.

    Fixes the original's per-row reopen of target.txt: both files are now
    opened once, via context managers, so they are always closed even on
    error.  Each cell is also echoed to stdout, as before.
    """
    with open('51.csv', 'r') as src, open('target.txt', 'a') as dst:
        for row in csv.DictReader(src):
            print(row['position_information'])
            dst.write(str(row['position_information']))
# Binarised image that determines the shape of the generated word cloud.
bg_image_path = "glob.png"
# Cleaned text corpus the cloud is built from (output of processFile).
text_path = 'data1.txt'
# Font file used for rendering (needed for CJK glyphs).
font_path = 'simfang.ttf'
# Path to the stop-word list, one word per line.
stopwords_path = 'stopword.txt'
def clean_using_stopword(text):
    """Segment *text* with jieba and drop stop words.

    Uses the common stop-word table at ``stopwords_path`` plus whatever the
    user added to it.  Returns the surviving words joined into one string.
    """
    # Precise-mode segmentation.
    seg_list = jieba.cut(text, cut_all=False)
    # BUG FIX: the original read the stop-word file in binary mode and then
    # called str() on the bytes, producing the "b'...'" repr in which newlines
    # are the two characters '\' 'n'; split('\n') therefore never split and no
    # stop word ever matched.  Read it as text instead.
    # (assumes the stop-word file is UTF-8 -- TODO confirm)
    with open(stopwords_path, encoding='utf-8') as f_stop:
        stopwords = set(f_stop.read().splitlines())
    # Keep words that are not stop words and are longer than one character.
    return ''.join(word for word in seg_list
                   if word.strip() not in stopwords and len(word.strip()) > 1)
def preprocessing():
    """Load the corpus at ``text_path`` and return it cleaned.

    Reads the file as UTF-8 text (the original passed raw bytes to jieba)
    and delegates segmentation plus stop-word removal to
    clean_using_stopword().
    """
    with open(text_path, encoding='UTF-8') as f:
        content = f.read()
    return clean_using_stopword(content)
def extract_keywords():
    """Extract the top-weighted keywords of the corpus.

    Returns a dict mapping keyword -> TF-IDF weight, suitable for
    WordCloud.generate_from_frequencies().
    """
    # Top 100 keywords with weights (the original comment said 1000, but the
    # code has always requested 100).
    tags = jieba.analyse.extract_tags(preprocessing(), 100, withWeight=True)
    keywords = {word: weight for word, weight in tags}
    # NOTE(review): this drops keywords whose *text* contains the substrings
    # 'ns'/'nr'.  It looks like the intent was to filter by POS tag
    # (place/person names), which this does not do -- confirm with the author.
    for key in list(keywords):
        if 'ns' in key or 'nr' in key:
            del keywords[key]
    return keywords
def draw_wordcloud():
    """Generate the keyword word cloud, display it and save wordcloud.jpg.

    The cloud is shaped by the binarised image at ``bg_image_path`` and
    weighted by the frequencies returned from extract_keywords().
    """
    back_coloring = np.array(Image.open(bg_image_path))
    # A CJK-capable font is mandatory, otherwise Chinese renders as boxes.
    # Consistency fix: use the module-level font_path constant instead of
    # repeating the literal 'simfang.ttf'.
    wc = WordCloud(font_path=font_path,
                   background_color="white",  # canvas colour
                   max_words=2000,            # cap on displayed words
                   mask=back_coloring,        # shape mask
                   random_state=30,           # number of colour schemes
                   width=1000,                # canvas width
                   height=500,                # canvas height
                   )
    # Build the cloud from keyword frequencies.
    wc.generate_from_frequencies(extract_keywords())
    # Show it in a matplotlib window, then persist to disk.
    plt.figure()
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
    wc.to_file("wordcloud.jpg")
def processCityData():
    """Draw a pie chart of job demand for the 15 most-seen cities in 51.csv.

    Also normalises the ``salary`` column to a "<low>k-<high>k" monthly
    format via the nested helper; that result is currently unused by the
    chart itself (kept for parity with the original code).
    """
    with open('51.csv', 'r') as f:
        regions = [row['region'] for row in csv.DictReader(f)]
    # "city-district" -> keep only the city part.
    citys = [str(region).split("-")[0] for region in regions]
    # Counter keeps first-occurrence order, matching the original dict build,
    # and is O(n) instead of the O(n^2) list.count() loop.
    city_dict = Counter(citys)

    def processSalary():
        """Normalise every salary cell to "<low>k-<high>k" per month."""
        with open('51.csv', 'r') as f:
            raw = [row['salary'] for row in csv.DictReader(f)]
        salary = []
        for pay in raw:
            pay = str(pay)
            if "千/月" in pay:  # thousands per month
                low, high = re.findall(r"(.*?)千/月", pay)[0].split("-")[:2]
                salary.append(low + "k" + "-" + high + "k")
            elif "万/月" in pay:  # ten-thousands per month
                low, high = re.findall(r"(.*?)万/月", pay)[0].split("-")[:2]
                salary.append(str(float(low) * 10) + "k" + "-" +
                              str(float(high) * 10) + "k")
            elif "万/年" in pay:  # ten-thousands per year -> monthly
                low, high = re.findall(r"(.*?)万/年", pay)[0].split("-")[:2]
                salary.append(str(round(float(low) / 12, 1)) + "k" + "-" +
                              str(round(float(high) / 12, 1)) + "k")
            else:
                # No recognised unit: fall back to a typical bracket.
                salary.append("7k-8k")
        return salary

    # NOTE(review): computed but never used by the pie chart below; kept for
    # parity with the original -- confirm whether it can be dropped.
    salary = processSalary()
    dic = dict(zip(citys, salary))

    labels = list(city_dict)[:15]
    values = [city_dict[c] for c in labels]
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK labels
    # Explode slices after the fifth one progressively so labels overlap less.
    distance = []
    offset = 1
    for index, _ in enumerate(labels, start=1):
        if index > 5:
            offset += 0.1
            distance.append(offset)
        else:
            distance.append(0)
    plt.pie(values, labels=labels, labeldistance=1.2, autopct='%2.1f%%',
            pctdistance=0.6, shadow=True, explode=distance)
    plt.axis('equal')  # force a circular pie
    plt.legend(loc='upper left', bbox_to_anchor=(-0.1, 1))
    plt.savefig('IT职位需求分布.jpg')
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()
def base_map(x, y) -> Map:
    """Build a pyecharts china map of job demand per city.

    x: iterable of city names; y: matching demand counts.
    Returns the configured Map instance, ready to render.
    """
    demand_pairs = list(zip(x, y))
    chart = Map(init_opts=opts.InitOpts(width='1200px', height='600px'))
    chart.add(series_name='城市职位需求量', data_pair=demand_pairs,
              maptype="china", is_selected=True)
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="ECI for cities in China"),
        visualmap_opts=opts.VisualMapOpts(is_show=True, min_=min(y),
                                          max_=max(y), is_piecewise=True),
    )
    return chart
def drawMap():
    """Render an HTML china map of job demand per city.

    Reads the ``region`` column of 51.csv, counts jobs per city and writes
    the chart to ./positionResuqst.html via base_map().
    """
    with open('51.csv', 'r') as f:
        regions = [row['region'] for row in csv.DictReader(f)]
    # "city-district" -> keep only the city part.
    citys = [str(region).split("-")[0] for region in regions]
    # O(n) counting; preserves first-occurrence order like the original dict.
    city_dict = Counter(citys)
    indexs = list(city_dict)
    values = [city_dict[city] for city in indexs]
    # Renamed from `map`, which shadowed the builtin.
    demand_map = base_map(indexs, values)
    demand_map.render("./positionResuqst.html")
def paplenumberAnalyse():
    """Render a pie chart of job demand per education level.

    Reads the ``education`` column of 51.csv, drops cells that are really
    head-count text (they contain "招", e.g. "招5人"), and writes the chart
    to pie_set_color.html.
    """
    with open('51.csv', 'r') as f:
        education_list = [row['education'] for row in csv.DictReader(f)]
    # O(n) counting instead of list.count() in a loop.
    education_dict = Counter(education_list)
    # Remove head-count cells that leaked into the education column.
    for item in list(education_dict):
        if "招" in str(item):
            del education_dict[item]
    print(dict(education_dict))
    labels = list(education_dict)
    counts = [education_dict[key] for key in labels]
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK labels
    (
        Pie()
        .add("", [list(pair) for pair in zip(labels, counts)])
        .set_colors(["blue", "green", "yellow", "red", "pink", "orange", "purple"])
        .set_global_opts(title_opts=opts.TitleOpts(title="职位可视化分析"))
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
        .render("pie_set_color.html")
    )
if __name__ == "__main__":
    # Guarded so importing this module no longer runs every chart.
    # Bar chart: top-ten job titles by popularity.
    position_view()
    # Word cloud of job requirements.
    draw_wordcloud()
    # Pie chart: IT demand across the 15 hottest cities.
    processCityData()
    # China map of demand per city.
    drawMap()
    # Demand share per education level.
    paplenumberAnalyse()