Preface
💖💖Author: 计算机程序员小杨 💙💙About me: I work in the computer field and am proficient in Java, WeChat Mini Programs, Python, Golang, Android, and several other IT directions. I take on custom project development, code walkthroughs, thesis-defense coaching, and documentation writing, and I also know some techniques for reducing text duplication. I love technology, enjoy digging into new tools and frameworks, and like solving real problems with code, so feel free to bring me any technical or coding questions! 💛💛A word of thanks: thank you all for your attention and support! 💕💕Contact 计算机程序员小杨 at the end of this post to get the source code 💜💜 Website projects | Android / Mini Program projects | Big data projects | Deep learning projects | Computer science graduation project topics 💜💜
1. Development Tools
Big data frameworks: Hadoop + Spark (Hive is not used in this build; customization is supported)
Development languages: Python + Java (both versions are available)
Back-end frameworks: Django + Spring Boot (Spring + SpringMVC + MyBatis) (both versions are available)
Front end: Vue + ElementUI + ECharts + HTML + CSS + JavaScript + jQuery
Key technologies: Hadoop, HDFS, Spark, Spark SQL, Pandas, NumPy
Database: MySQL
2. System Overview
The Global University Ranking Data Visualization and Analysis System is an intelligent analytics platform built on the Hadoop + Spark big-data stack, with Django as the back-end framework and Vue + ElementUI + ECharts handling front-end interaction and visualization. The system stores large volumes of university ranking data in HDFS, performs distributed computation and machine-learning modeling with Spark SQL and Spark MLlib, and uses Pandas and NumPy for preprocessing such as data cleaning and statistical analysis. Its core features cover user and permission management; create/read/update/delete operations on the global ranking data; institution clustering based on K-Means and similar algorithms; multi-dimensional competitiveness comparison; analysis of the global higher-education landscape; assessment of institutional competitiveness within a region; and a consolidated data dashboard. By fusing and mining authoritative ranking sources such as QS and Times, the system reveals how universities in different countries and regions develop, where their discipline strengths lie, and how competitive dynamics shift over time, supporting policy making by education authorities, strategic planning by universities, and school-selection decisions by students, closing the loop from raw data collection to intelligent analysis and decision-making.
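The source listing in section 5 starts from data already loaded into MySQL; the Pandas/NumPy cleaning pass mentioned above is not shown there. Below is a minimal sketch of what such a pass might look like, assuming a raw CSV export; the column names are assumptions based on the fields the views use later.

import pandas as pd

def clean_ranking_csv(path):
    """Basic cleaning pass over a raw ranking export before loading it into MySQL/HDFS (sketch)."""
    df = pd.read_csv(path)
    # Drop exact duplicates and rows missing the key identifiers.
    df = df.drop_duplicates().dropna(subset=["university_name", "ranking_year"])
    # Coerce score columns to numeric; malformed values become NaN.
    score_cols = ["academic_score", "employer_score", "faculty_ratio",
                  "citations_score", "international_score"]
    df[score_cols] = df[score_cols].apply(pd.to_numeric, errors="coerce")
    # Clamp scores to the expected 0-100 range.
    df[score_cols] = df[score_cols].clip(0, 100)
    return df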
3. System Feature Demo
4. System UI Showcase
5. Source Code Showcase
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import col, avg, rank, dense_rank, count, sum as spark_sum, when
from pyspark.sql.window import Window
from django.http import JsonResponse
from django.views import View
import pandas as pd
import numpy as np
import json
# One shared SparkSession for every view in this module; getOrCreate() makes it a singleton per process.
spark = SparkSession.builder.appName("GlobalUniversityRanking") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "2g") \
    .getOrCreate()
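Each of the three views below re-reads the same MySQL table over JDBC with hard-coded credentials. A small helper like the following would remove that duplication; this is a sketch, and pulling the user and password from Django's settings.DATABASES is an assumption about how the project is configured, not something the original code does.

from django.conf import settings  # assumption: DB credentials live in Django settings

def load_ranking_table(table="global_university_ranking"):
    """Read a MySQL table into a Spark DataFrame over JDBC (sketch)."""
    return (spark.read.format("jdbc")
            .option("url", "jdbc:mysql://localhost:3306/university_db")
            .option("dbtable", table)
            .option("user", settings.DATABASES["default"]["USER"])
            .option("password", settings.DATABASES["default"]["PASSWORD"])
            .load())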
class UniversityClusteringView(View):
    """Cluster universities with K-Means on a configurable set of score dimensions."""

    def post(self, request):
        params = json.loads(request.body)
        cluster_num = params.get('cluster_num', 5)
        features = params.get('features', ['academic_score', 'employer_score', 'faculty_ratio',
                                           'citations_score', 'international_score'])
        df = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/university_db") \
            .option("dbtable", "global_university_ranking") \
            .option("user", "root").option("password", "password").load()
        # Keep the 2024 rankings and drop rows with missing feature values.
        df_filtered = df.filter(col("ranking_year") == 2024).na.drop(subset=features)
        # Assemble the features into one vector and standardize it (zero mean, unit
        # variance) so that no single dimension dominates the distance metric.
        assembler = VectorAssembler(inputCols=features, outputCol="raw_features")
        df_assembled = assembler.transform(df_filtered)
        scaler = StandardScaler(inputCol="raw_features", outputCol="features", withStd=True, withMean=True)
        scaler_model = scaler.fit(df_assembled)
        df_scaled = scaler_model.transform(df_assembled)
        # Fit K-Means with a fixed seed so cluster assignments are reproducible.
        kmeans = KMeans(k=cluster_num, seed=42, maxIter=50, featuresCol="features", predictionCol="cluster")
        kmeans_model = kmeans.fit(df_scaled)
        df_clustered = kmeans_model.transform(df_scaled)
        # Per-cluster sizes and average scores for the front-end summary table.
        cluster_stats = df_clustered.groupBy("cluster").agg(
            count("university_id").alias("university_count"), avg("academic_score").alias("avg_academic"),
            avg("employer_score").alias("avg_employer"), avg("faculty_ratio").alias("avg_faculty"),
            avg("citations_score").alias("avg_citations"), avg("international_score").alias("avg_international"))
        cluster_result = df_clustered.select("university_id", "university_name", "country", "cluster").toPandas()
        stats_result = cluster_stats.toPandas()
        # clusterCenters() returns NumPy arrays, which JsonResponse cannot serialize;
        # convert each center to a plain Python list first.
        result_data = {"cluster_distribution": cluster_result.to_dict(orient='records'),
                       "cluster_statistics": stats_result.to_dict(orient='records'),
                       "cluster_centers": [center.tolist() for center in kmeans_model.clusterCenters()]}
        return JsonResponse({"code": 200, "message": "Clustering analysis complete", "data": result_data})
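The number of clusters is taken straight from the request. If the server should suggest a k instead, Spark's ClusteringEvaluator can score candidate values by silhouette. The sketch below is an illustration only; it reuses the module's KMeans import and a df_scaled DataFrame prepared as in the view above.

from pyspark.ml.evaluation import ClusteringEvaluator

def suggest_cluster_num(df_scaled, candidates=range(3, 9)):
    """Return the candidate k with the best silhouette score (closer to 1 is better)."""
    evaluator = ClusteringEvaluator(featuresCol="features", predictionCol="cluster",
                                    metricName="silhouette")
    scores = {}
    for k in candidates:
        model = KMeans(k=k, seed=42, featuresCol="features", predictionCol="cluster").fit(df_scaled)
        scores[k] = evaluator.evaluate(model.transform(df_scaled))
    return max(scores, key=scores.get)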
class CompetitivenessAnalysisView(View):
    """Head-to-head competitiveness comparison between a target university and a peer list."""

    def post(self, request):
        params = json.loads(request.body)
        target_university_id = params.get('university_id')
        compare_universities = params.get('compare_list', [])
        year_range = params.get('year_range', [2020, 2024])
        df = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/university_db") \
            .option("dbtable", "global_university_ranking") \
            .option("user", "root").option("password", "password").load()
        # Restrict to the target plus its comparison peers within the requested years.
        all_universities = [target_university_id] + compare_universities
        df_filtered = df.filter((col("university_id").isin(all_universities)) &
                                (col("ranking_year").between(year_range[0], year_range[1])))
        # Rank the selected universities against each other, per year, by overall score.
        window_spec = Window.partitionBy("ranking_year").orderBy(col("overall_score").desc())
        df_ranked = df_filtered.withColumn("global_rank_position", rank().over(window_spec))
        # Year-by-year score trend for the line chart.
        trend_df = df_ranked.groupBy("university_id", "university_name", "ranking_year").agg(
            avg("overall_score").alias("avg_overall_score"),
            avg("academic_score").alias("avg_academic_score"),
            avg("global_rank_position").alias("avg_rank"))
        trend_result = trend_df.orderBy("university_id", "ranking_year").toPandas()
        # Radar chart: min-max normalize each strength dimension to a 0-100 scale.
        strength_cols = ["academic_score", "employer_score", "faculty_ratio",
                         "citations_score", "international_score", "research_output"]
        latest_year = year_range[1]
        df_latest = df_filtered.filter(col("ranking_year") == latest_year)
        radar_data = df_latest.select("university_id", "university_name", *strength_cols).toPandas()
        radar_normalized = radar_data.copy()
        for col_name in strength_cols:
            max_val = radar_normalized[col_name].max()
            min_val = radar_normalized[col_name].min()
            # If every value is identical, fall back to the midpoint of the scale.
            radar_normalized[col_name] = ((radar_normalized[col_name] - min_val) / (max_val - min_val) * 100
                                          if max_val != min_val else 50)
        # Weighted composite index; the weights sum to 1.0.
        competitive_index = df_latest.withColumn("competitive_score",
            (col("academic_score") * 0.3 + col("employer_score") * 0.2 +
             col("citations_score") * 0.25 + col("international_score") * 0.15 +
             col("faculty_ratio") * 0.1))
        competitive_result = competitive_index.select("university_id", "university_name", "competitive_score") \
            .orderBy(col("competitive_score").desc()).toPandas()
        result_data = {"trend_analysis": trend_result.to_dict(orient='records'),
                       "strength_radar": radar_normalized.to_dict(orient='records'),
                       "competitive_ranking": competitive_result.to_dict(orient='records')}
        return JsonResponse({"code": 200, "message": "Competitiveness analysis complete", "data": result_data})
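A front-end call to this endpoint only needs the target id, the peer ids, and a year range. Here is a minimal client-side sketch for testing; the /api/competitiveness/ URL and the id values are assumptions, since the routes depend on how the project wires its urls.py.

import requests  # hypothetical test client for the endpoint above

payload = {
    "university_id": 1,         # target university (id is an assumption)
    "compare_list": [2, 3, 4],  # peers to compare against
    "year_range": [2020, 2024],
}
resp = requests.post("http://localhost:8000/api/competitiveness/", json=payload)
print(resp.json()["data"]["competitive_ranking"])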
class GlobalPatternAnalysisView(View):
    """Global higher-education landscape: country, region, and discipline breakdowns."""

    def post(self, request):
        params = json.loads(request.body)
        analysis_year = params.get('year', 2024)
        top_n = params.get('top_n', 500)
        df = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/university_db") \
            .option("dbtable", "global_university_ranking") \
            .option("user", "root").option("password", "password").load()
        df_year = df.filter((col("ranking_year") == analysis_year) & (col("global_rank") <= top_n))
        # Per-country counts, average scores, and how many universities reach the top 50/100.
        country_stats = df_year.groupBy("country").agg(
            count("university_id").alias("university_count"), avg("overall_score").alias("avg_score"),
            spark_sum(when(col("global_rank") <= 50, 1).otherwise(0)).alias("top50_count"),
            spark_sum(when(col("global_rank") <= 100, 1).otherwise(0)).alias("top100_count"))
        country_result = country_stats.orderBy(col("university_count").desc()).toPandas()
        # Map countries onto regions with a when/otherwise chain so the lookup runs on Spark columns.
        df_with_region = df_year.withColumn("region",
            when(col("country") == "USA", "North America")
            .when(col("country") == "Canada", "North America")
            .when(col("country").isin(["UK", "Germany", "France", "Netherlands", "Switzerland"]), "Europe")
            .when(col("country").isin(["China", "Japan", "Singapore", "South Korea", "Hong Kong"]), "Asia")
            .when(col("country").isin(["Australia", "New Zealand"]), "Oceania")
            .otherwise("Other"))
        region_stats = df_with_region.groupBy("region").agg(
            count("university_id").alias("university_count"), avg("overall_score").alias("avg_score"),
            avg("academic_score").alias("avg_academic"), avg("research_output").alias("avg_research"))
        region_result = region_stats.orderBy(col("university_count").desc()).toPandas()
        # Discipline view: for each discipline, the top 5 countries by number of
        # universities placing in that discipline's top 100.
        discipline_df = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/university_db") \
            .option("dbtable", "university_discipline_ranking") \
            .option("user", "root").option("password", "password").load()
        discipline_filtered = discipline_df.filter((col("ranking_year") == analysis_year) &
                                                   (col("discipline_rank") <= 100))
        discipline_country = discipline_filtered.groupBy("country", "discipline_name").agg(
            count("university_id").alias("discipline_count"))
        window_discipline = Window.partitionBy("discipline_name").orderBy(col("discipline_count").desc())
        discipline_top = discipline_country.withColumn("rank", dense_rank().over(window_discipline)) \
            .filter(col("rank") <= 5)
        discipline_result = discipline_top.orderBy("discipline_name", "rank").toPandas()
        result_data = {"country_distribution": country_result.to_dict(orient='records'),
                       "region_analysis": region_result.to_dict(orient='records'),
                       "discipline_advantage": discipline_result.to_dict(orient='records')}
        return JsonResponse({"code": 200, "message": "Global pattern analysis complete", "data": result_data})
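To expose these three views, the project would register them in Django's URL configuration. A minimal sketch follows; the route paths and the analysis.views module name are assumptions, and in a real deployment these POST endpoints would also need CSRF handling (a token from the Vue front end or csrf_exempt).

# urls.py (sketch; paths and module names are assumptions)
from django.urls import path
from analysis.views import (UniversityClusteringView, CompetitivenessAnalysisView,
                            GlobalPatternAnalysisView)

urlpatterns = [
    path('api/clustering/', UniversityClusteringView.as_view()),
    path('api/competitiveness/', CompetitivenessAnalysisView.as_view()),
    path('api/global-pattern/', GlobalPatternAnalysisView.as_view()),
]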
6. System Documentation Showcase
Closing
💕💕To get the source code, contact 计算机程序员小杨