一、About the Author
💖💖Author: 计算机编程果茶熊 💙💙About me: I worked for many years in computer science training as a programming instructor, and I genuinely enjoy teaching. My areas of expertise include Java, WeChat Mini Programs, Python, Golang, Android, and several other IT fields. I take on customized project development, code walkthroughs, thesis-defense coaching, and documentation writing, and I also know some techniques for reducing text-similarity scores. I like sharing solutions to problems I run into during development and exchanging ideas about technology, so feel free to ask me anything about code or technical issues! 💛💛A few words: thank you all for your attention and support! 💜💜 Website practical projects | Android/Mini Program practical projects | Big data practical projects | Computer science graduation project topics 💕💕See the end of this post to get the source code: contact 计算机编程果茶熊
二、System Overview
Big data framework: Hadoop + Spark (Hive requires custom modification)
Development languages: Java + Python (both versions are supported)
Database: MySQL
Backend frameworks: SpringBoot (Spring + SpringMVC + MyBatis) + Django (both versions are supported)
Frontend: Vue + Echarts + HTML + CSS + JavaScript + jQuery
The BOSS直聘 Job Recruitment Data Visualization and Analysis System is a recruitment data analysis platform built on a big data stack. It uses Hadoop + Spark as the core big data processing framework and a dual-language (Python and Java) development model to provide a complete pipeline for data collection, processing, analysis, and visualization. The backend is built with the Django framework, the frontend uses Vue + ElementUI + Echarts to deliver a modern user interface, and MySQL handles data storage. The core functionality covers city-level recruitment analysis, industry scale analysis, skill requirement analysis, job profile analysis, and a large-screen visualization dashboard, enabling deep mining and multi-dimensional analysis of the massive volume of job postings on the BOSS直聘 platform. Spark SQL performs efficient query processing, Pandas and NumPy handle the data science computation, and the results are finally rendered as intuitive visualizations with the Echarts charting library, giving job seekers, corporate HR staff, and researchers valuable insight into the recruitment market and support for decision-making.
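To make that data flow concrete, the following is a minimal sketch of how one aggregated Spark SQL result could be handed from Pandas to the Echarts frontend through a Django JSON view. The table name job_info, the column names city and job_id, the database credentials, and the view name are illustrative assumptions, not the project's actual code.

# Minimal sketch (assumed names): Spark SQL aggregation -> Pandas -> Django JSON view -> Echarts
from django.http import JsonResponse
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("BOSSRecruitmentAPI").getOrCreate()

def city_job_count(request):
    # Assumed MySQL table "job_info" with columns "city" and "job_id"
    df = spark.read.format("jdbc") \
        .option("url", "jdbc:mysql://localhost:3306/recruitment") \
        .option("dbtable", "job_info") \
        .option("user", "root").option("password", "password").load()
    df.createOrReplaceTempView("job_info")
    # Aggregate with Spark SQL, then convert the small result set to Pandas
    stats = spark.sql(
        "SELECT city, COUNT(job_id) AS job_count FROM job_info GROUP BY city ORDER BY job_count DESC LIMIT 10"
    ).toPandas()
    # Echarts bar charts typically expect parallel category/value arrays
    return JsonResponse({
        "cities": stats["city"].tolist(),
        "job_counts": stats["job_count"].tolist(),
    })

On the Vue side, the returned arrays would be assigned to the xAxis.data and series[0].data fields of an Echarts option object; the same pattern applies to the other analysis modules.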
三、Video Walkthrough
四、Selected Features
五、Selected Code
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import pandas as pd
import numpy as np

# Create a SparkSession with adaptive query execution enabled
spark = SparkSession.builder.appName("BOSSRecruitmentAnalysis").config("spark.sql.adaptive.enabled", "true").config("spark.sql.adaptive.coalescePartitions.enabled", "true").getOrCreate()
def city_recruitment_analysis():
    # Load job postings from MySQL via JDBC
    recruitment_df = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/recruitment").option("dbtable", "job_info").option("user", "root").option("password", "password").load()
    # Per-city job counts, average salary band, and distinct company counts
    city_stats = recruitment_df.groupBy("city").agg(count("job_id").alias("job_count"), avg("salary_min").alias("avg_min_salary"), avg("salary_max").alias("avg_max_salary"), countDistinct("company_name").alias("company_count"))
    # Monthly posting trend per city
    city_demand_trend = recruitment_df.withColumn("month", date_format("publish_time", "yyyy-MM")).groupBy("city", "month").agg(count("job_id").alias("monthly_jobs"))
    # Bucket postings into low / medium / high salary bands
    city_salary_distribution = recruitment_df.select("city", "salary_min", "salary_max").withColumn("salary_range", when(col("salary_max") <= 8000, "low").when((col("salary_max") > 8000) & (col("salary_max") <= 15000), "medium").otherwise("high"))
    salary_range_stats = city_salary_distribution.groupBy("city", "salary_range").count()
    hot_cities = city_stats.orderBy(desc("job_count")).limit(10)
    # Month-over-month demand growth rate per city
    city_growth_rate = city_demand_trend.withColumn("prev_month_jobs", lag("monthly_jobs").over(Window.partitionBy("city").orderBy("month"))).withColumn("growth_rate", ((col("monthly_jobs") - col("prev_month_jobs")) / col("prev_month_jobs") * 100))
    # Industry mix within each city, as a percentage of the city's total postings
    city_industry_distribution = recruitment_df.groupBy("city", "industry").agg(count("job_id").alias("industry_jobs")).withColumn("city_total_jobs", sum("industry_jobs").over(Window.partitionBy("city"))).withColumn("industry_percentage", col("industry_jobs") / col("city_total_jobs") * 100)
    final_city_analysis = hot_cities.join(salary_range_stats, "city", "left").join(city_growth_rate.groupBy("city").agg(avg("growth_rate").alias("avg_growth_rate")), "city", "left")
    result_pandas = final_city_analysis.toPandas()
    analysis_summary = {"total_cities": city_stats.count(), "total_jobs": recruitment_df.count(), "avg_jobs_per_city": recruitment_df.groupBy("city").count().agg(avg("count")).collect()[0][0], "top_salary_city": city_stats.orderBy(desc("avg_max_salary")).select("city").collect()[0][0]}
    return result_pandas, analysis_summary
def industry_scale_analysis():
    job_industry_df = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/recruitment").option("dbtable", "job_info").option("user", "root").option("password", "password").load()
    # Overall scale of each industry: postings, companies, and average salary band
    industry_scale = job_industry_df.groupBy("industry").agg(count("job_id").alias("job_count"), countDistinct("company_name").alias("company_count"), avg("salary_min").alias("avg_min_salary"), avg("salary_max").alias("avg_max_salary"))
    # Quarterly posting trend per industry
    industry_trend = job_industry_df.withColumn("quarter", concat(year("publish_time"), lit("Q"), quarter("publish_time"))).groupBy("industry", "quarter").agg(count("job_id").alias("quarterly_jobs"))
    # Company-size, experience, and education mix within each industry, as percentages
    industry_company_size = job_industry_df.groupBy("industry", "company_size").agg(count("job_id").alias("size_jobs")).withColumn("industry_total", sum("size_jobs").over(Window.partitionBy("industry"))).withColumn("size_percentage", col("size_jobs") / col("industry_total") * 100)
    experience_requirement = job_industry_df.groupBy("industry", "experience_required").agg(count("job_id").alias("exp_jobs")).withColumn("industry_exp_total", sum("exp_jobs").over(Window.partitionBy("industry"))).withColumn("exp_percentage", col("exp_jobs") / col("industry_exp_total") * 100)
    education_requirement = job_industry_df.groupBy("industry", "education_required").agg(count("job_id").alias("edu_jobs")).withColumn("industry_edu_total", sum("edu_jobs").over(Window.partitionBy("industry"))).withColumn("edu_percentage", col("edu_jobs") / col("industry_edu_total") * 100)
    # Quarter-over-quarter growth rate per industry
    industry_growth = industry_trend.withColumn("prev_quarter_jobs", lag("quarterly_jobs").over(Window.partitionBy("industry").orderBy("quarter"))).withColumn("growth_rate", ((col("quarterly_jobs") - col("prev_quarter_jobs")) / col("prev_quarter_jobs") * 100))
    top_industries = industry_scale.orderBy(desc("job_count")).limit(15)
    # Jobs per company as a rough competition index
    industry_competition_index = industry_scale.withColumn("competition_index", col("job_count") / col("company_count"))
    comprehensive_industry_analysis = top_industries.join(industry_growth.groupBy("industry").agg(avg("growth_rate").alias("avg_growth_rate")), "industry", "left").join(industry_competition_index.select("industry", "competition_index"), "industry", "left")
    # Industries holding at least 5% of all postings (an empty partitionBy spans the whole dataset)
    market_concentration = industry_scale.withColumn("market_share", col("job_count") / sum("job_count").over(Window.partitionBy())).filter(col("market_share") >= 0.05)
    result_data = comprehensive_industry_analysis.toPandas()
    industry_summary = {"total_industries": industry_scale.count(), "largest_industry": industry_scale.orderBy(desc("job_count")).select("industry").collect()[0][0], "highest_paying_industry": industry_scale.orderBy(desc("avg_max_salary")).select("industry").collect()[0][0]}
    return result_data, industry_summary
def skill_requirement_analysis():
    # Read the skill table and the job table once, then reuse them in the joins below
    job_skills_df = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/recruitment").option("dbtable", "job_skills").option("user", "root").option("password", "password").load()
    job_info_df = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/recruitment").option("dbtable", "job_info").option("user", "root").option("password", "password").load()
    # How often each skill is demanded, and by how many companies
    skill_frequency = job_skills_df.groupBy("skill_name").agg(count("job_id").alias("skill_demand"), countDistinct("job_id").alias("unique_jobs"), countDistinct("company_name").alias("companies_requiring"))
    # Average and spread of salaries for postings that require each skill
    skill_salary_correlation = job_skills_df.join(job_info_df, "job_id").groupBy("skill_name").agg(avg("salary_min").alias("avg_min_salary"), avg("salary_max").alias("avg_max_salary"), stddev("salary_max").alias("salary_stddev"))
    # Skill penetration rate within each industry
    industry_skill_matrix = job_skills_df.join(job_info_df, "job_id").groupBy("industry", "skill_name").agg(count("job_id").alias("industry_skill_jobs")).withColumn("industry_total_jobs", sum("industry_skill_jobs").over(Window.partitionBy("industry"))).withColumn("skill_penetration_rate", col("industry_skill_jobs") / col("industry_total_jobs") * 100)
    # Pairs of skills that frequently appear together in the same posting
    skill_combination_analysis = job_skills_df.alias("s1").join(job_skills_df.alias("s2"), col("s1.job_id") == col("s2.job_id")).filter(col("s1.skill_name") < col("s2.skill_name")).groupBy("s1.skill_name", "s2.skill_name").agg(count("s1.job_id").alias("combination_count")).orderBy(desc("combination_count"))
    # Skills whose year-over-year demand grew by more than 50%
    emerging_skills = job_skills_df.join(job_info_df, "job_id").withColumn("year", year("publish_time")).groupBy("skill_name", "year").agg(count("job_id").alias("yearly_demand")).withColumn("prev_year_demand", lag("yearly_demand").over(Window.partitionBy("skill_name").orderBy("year"))).withColumn("growth_rate", ((col("yearly_demand") - col("prev_year_demand")) / col("prev_year_demand") * 100)).filter(col("growth_rate") > 50)
    # Experience-level distribution for each skill
    skill_experience_mapping = job_skills_df.join(job_info_df, "job_id").groupBy("skill_name", "experience_required").agg(count("job_id").alias("exp_skill_jobs")).withColumn("skill_total_jobs", sum("exp_skill_jobs").over(Window.partitionBy("skill_name"))).withColumn("experience_distribution", col("exp_skill_jobs") / col("skill_total_jobs") * 100)
    top_skills = skill_frequency.orderBy(desc("skill_demand")).limit(20)
    high_value_skills = skill_salary_correlation.filter(col("avg_max_salary") > 15000).orderBy(desc("avg_max_salary"))
    comprehensive_skill_analysis = top_skills.join(skill_salary_correlation, "skill_name", "left").join(emerging_skills.groupBy("skill_name").agg(avg("growth_rate").alias("avg_growth_rate")), "skill_name", "left")
    skill_market_analysis = comprehensive_skill_analysis.withColumn("skill_value_index", col("avg_max_salary") * col("skill_demand") / 1000)
    result_skill_data = comprehensive_skill_analysis.toPandas()
    skill_analysis_summary = {"total_skills": skill_frequency.count(), "most_demanded_skill": skill_frequency.orderBy(desc("skill_demand")).select("skill_name").collect()[0][0], "highest_paying_skill": skill_salary_correlation.orderBy(desc("avg_max_salary")).select("skill_name").collect()[0][0]}
    return result_skill_data, skill_analysis_summary
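As a usage sketch (not part of the project source), the three analysis functions above could be driven from one entry point that saves each detail table and summary for the dashboard layer; the output file names here are assumptions.

import json

if __name__ == "__main__":
    # Run each analysis, persist the detail table and the summary metrics (assumed output paths)
    for name, func in [("city", city_recruitment_analysis), ("industry", industry_scale_analysis), ("skill", skill_requirement_analysis)]:
        detail_df, summary = func()
        detail_df.to_csv(f"{name}_analysis_detail.csv", index=False)  # full table for the visualization layer
        with open(f"{name}_analysis_summary.json", "w", encoding="utf-8") as f:
            json.dump(summary, f, ensure_ascii=False, indent=2)
    spark.stop()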
六、Selected Documentation
七、END
💕💕To get the source code, contact 计算机编程果茶熊