Preface
💖💖Author: 计算机程序员小杨 💙💙About me: I work in the computer field and specialize in Java, WeChat Mini Programs, Python, Golang, Android, and several other IT directions. I take on customized project development, code walkthroughs, thesis-defense coaching, and documentation writing, and I also know some techniques for lowering similarity-check rates. I love technology, enjoy exploring new tools and frameworks, and like solving real problems with code, so feel free to ask me about anything code-related! 💛💛A few words: thank you all for your attention and support! 💕💕Contact 计算机程序员小杨 at the end of the article to get the source code 💜💜 Web practical projects · Android/Mini Program practical projects · Big data practical projects · Deep learning practical projects · Graduation project topic selection 💜💜
I. Development Tools Overview
Big data framework: Hadoop + Spark (Hive is not used in this build; customization is supported)
Development languages: Python + Java (both versions are supported)
Backend frameworks: Django + Spring Boot (Spring + SpringMVC + MyBatis) (both versions are supported)
Frontend: Vue + ElementUI + Echarts + HTML + CSS + JavaScript + jQuery
Key technologies: Hadoop, HDFS, Spark, Spark SQL, Pandas, NumPy
Database: MySQL
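As a minimal, hedged illustration of how the storage layers listed above fit together, the sketch below reads raw JSON from HDFS with Spark and writes an aggregate table into MySQL over JDBC. The HDFS path, database name, table name, and credentials are placeholders rather than the project's real configuration, and the MySQL JDBC driver must be available on the Spark classpath.

from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, count

spark = SparkSession.builder.appName("HdfsToMysqlSketch").getOrCreate()

# Read raw video records from HDFS (placeholder path)
videos = spark.read.json("hdfs:///bilibili/videos")

# A small Spark SQL aggregation to illustrate the processing step
category_stats = videos.groupBy("category").agg(
    count("video_id").alias("video_count"),
    avg("view_count").alias("avg_views")
)

# Write the result into MySQL via JDBC (placeholder URL and credentials)
category_stats.write.format("jdbc") \
    .option("url", "jdbc:mysql://localhost:3306/bilibili_analysis?useSSL=false") \
    .option("dbtable", "category_stats") \
    .option("user", "root") \
    .option("password", "123456") \
    .option("driver", "com.mysql.cj.jdbc.Driver") \
    .mode("overwrite") \
    .save()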
II. System Overview
The Bilibili Hot Video Data Visualization and Analysis System is a video platform data analysis system built on big data technologies. It uses the Hadoop + Spark distributed computing framework as its data processing core and combines a dual backend of Django and Spring Boot to provide comprehensive video data insights. The frontend is built with Vue and ElementUI, with Echarts handling data visualization, while the backend uses Spark SQL and Pandas for efficient data processing and analysis. Large volumes of video data are stored in the HDFS distributed file system, structured data is managed in a MySQL database, and the system implements a complete pipeline from data collection and storage through processing to visualization. Its core functions cover creator behavior analysis, user interaction data mining, video publish-time pattern recognition, content tag feature extraction, and regional propagation path analysis, giving platform operators and content creators a scientific data basis for decisions and helping users understand video propagation patterns and user preference trends.
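To make the serving side of this pipeline concrete, here is a minimal sketch of how the Django backend could expose one of the precomputed results to the Echarts frontend. It assumes the Spark job has already written a creator_ranking table into MySQL; the table name, columns, and connection details are hypothetical placeholders, not the project's actual code.

from django.http import JsonResponse
import pymysql

def creator_ranking_view(request):
    # Placeholder connection details; in a real project these come from settings
    conn = pymysql.connect(host="localhost", user="root", password="123456",
                           database="bilibili_analysis", charset="utf8mb4")
    try:
        with conn.cursor() as cursor:
            # creator_ranking is a hypothetical result table produced by the Spark job
            cursor.execute("SELECT creator_name, performance_score "
                           "FROM creator_ranking ORDER BY performance_score DESC LIMIT 10")
            rows = cursor.fetchall()
    finally:
        conn.close()
    # Echarts bar charts typically take parallel arrays of labels and values
    return JsonResponse({
        "names": [row[0] for row in rows],
        "scores": [float(row[1]) for row in rows],
    })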
III. System Function Demo
IV. System Interface Display
V. System Source Code
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window  # required for the window functions used below
import pandas as pd
import numpy as np

# Create the Spark session with adaptive query execution enabled
spark = SparkSession.builder.appName("BilibiliAnalysis").config("spark.sql.adaptive.enabled", "true").getOrCreate()
def creator_analysis(video_data_path, creator_data_path):
    # Load video and creator records and join them on creator_id
    video_df = spark.read.json(video_data_path)
    creator_df = spark.read.json(creator_data_path)
    joined_df = video_df.join(creator_df, "creator_id", "inner")
    # Per-creator totals and averages
    creator_stats = joined_df.groupBy("creator_id", "creator_name").agg(
        count("video_id").alias("video_count"),
        sum("view_count").alias("total_views"),
        avg("view_count").alias("avg_views"),
        sum("like_count").alias("total_likes"),
        sum("comment_count").alias("total_comments"),
        sum("share_count").alias("total_shares")
    )
    # Engagement rate = (likes + comments + shares) / views
    creator_engagement = creator_stats.withColumn("engagement_rate",
        (col("total_likes") + col("total_comments") + col("total_shares")) / col("total_views"))
    # Weighted performance score combining reach and engagement
    creator_performance = creator_engagement.withColumn("performance_score",
        col("avg_views") * 0.4 + col("engagement_rate") * 1000 * 0.6)
    creator_ranking = creator_performance.orderBy(desc("performance_score"))
    # Monthly publishing trend per creator
    creator_trend = joined_df.withColumn("publish_month",
        date_format(col("publish_time"), "yyyy-MM")).groupBy("creator_id", "publish_month").agg(
        count("video_id").alias("monthly_videos"),
        avg("view_count").alias("monthly_avg_views")
    ).orderBy("creator_id", "publish_month")
    # Rank each creator's categories by average views
    creator_category_analysis = joined_df.groupBy("creator_id", "category").agg(
        count("video_id").alias("category_videos"),
        avg("view_count").alias("category_avg_views")
    ).withColumn("category_rank", row_number().over(Window.partitionBy("creator_id").orderBy(desc("category_avg_views"))))
    return creator_ranking.toPandas(), creator_trend.toPandas(), creator_category_analysis.toPandas()
def user_behavior_analysis(interaction_data_path, user_data_path):
    # Load interaction and user records and join them on user_id
    interaction_df = spark.read.json(interaction_data_path)
    user_df = spark.read.json(user_data_path)
    user_interaction = interaction_df.join(user_df, "user_id", "inner")
    # Hourly activity pattern per user
    user_activity_patterns = user_interaction.withColumn("hour", hour(col("interaction_time"))).groupBy("user_id", "hour").agg(
        count("interaction_id").alias("hourly_interactions"),
        countDistinct("video_id").alias("unique_videos_watched")
    )
    # Per-category preferences: interaction counts, watch duration, and interaction types
    user_preference_analysis = user_interaction.groupBy("user_id", "category").agg(
        count("interaction_id").alias("category_interactions"),
        avg("watch_duration").alias("avg_watch_duration"),
        sum(when(col("interaction_type") == "like", 1).otherwise(0)).alias("likes_given"),
        sum(when(col("interaction_type") == "comment", 1).otherwise(0)).alias("comments_made"),
        sum(when(col("interaction_type") == "share", 1).otherwise(0)).alias("shares_made")
    )
    # Weighted engagement score: comments and shares weigh more than likes
    user_engagement_score = user_preference_analysis.withColumn("engagement_score",
        col("likes_given") * 1 + col("comments_made") * 3 + col("shares_made") * 5)
    # Loyalty: interactions per day over the user's activity span
    user_loyalty_metrics = user_interaction.groupBy("user_id").agg(
        countDistinct("creator_id").alias("creators_followed"),
        datediff(max("interaction_time"), min("interaction_time")).alias("activity_span_days"),
        count("interaction_id").alias("total_interactions")
    ).withColumn("loyalty_score", col("total_interactions") / (col("activity_span_days") + 1))
    # Cohort activity by first-interaction month and 30-day periods
    user_cohort_analysis = user_interaction.withColumn("cohort_month",
        date_format(col("first_interaction_time"), "yyyy-MM")).withColumn("period_number",
        datediff(col("interaction_time"), col("first_interaction_time")) / 30).groupBy("cohort_month", "period_number").agg(
        countDistinct("user_id").alias("active_users")
    )
    return user_activity_patterns.toPandas(), user_engagement_score.toPandas(), user_loyalty_metrics.toPandas()
def time_pattern_analysis(video_data_path, interaction_data_path):
    # Load video and interaction records
    video_df = spark.read.json(video_data_path)
    interaction_df = spark.read.json(interaction_data_path)
    # Publishing volume and performance by hour of day
    hourly_publish_pattern = video_df.withColumn("publish_hour", hour(col("publish_time"))).groupBy("publish_hour").agg(
        count("video_id").alias("videos_published"),
        avg("view_count").alias("avg_views_by_hour"),
        avg("like_count").alias("avg_likes_by_hour")
    ).withColumn("performance_index", col("avg_views_by_hour") / col("videos_published"))
    # Daily totals with the previous day's average views as a momentum reference
    daily_trend_analysis = video_df.withColumn("publish_date", date_format(col("publish_time"), "yyyy-MM-dd")).groupBy("publish_date").agg(
        count("video_id").alias("daily_videos"),
        sum("view_count").alias("daily_total_views"),
        avg("view_count").alias("daily_avg_views")
    ).withColumn("trend_momentum", lag("daily_avg_views", 1).over(Window.orderBy("publish_date")))
    # Day-of-week publishing pattern and view volatility
    weekly_pattern = video_df.withColumn("day_of_week", dayofweek(col("publish_time"))).groupBy("day_of_week").agg(
        count("video_id").alias("weekly_videos"),
        avg("view_count").alias("weekly_avg_views"),
        stddev("view_count").alias("view_volatility")
    )
    # Seasonal performance per content category
    seasonal_analysis = video_df.withColumn("season",
        when(month(col("publish_time")).isin([12, 1, 2]), "Winter")
        .when(month(col("publish_time")).isin([3, 4, 5]), "Spring")
        .when(month(col("publish_time")).isin([6, 7, 8]), "Summer")
        .otherwise("Autumn")).groupBy("season", "category").agg(
        count("video_id").alias("seasonal_videos"),
        avg("view_count").alias("seasonal_avg_views"),
        max("view_count").alias("seasonal_peak_views")
    )
    # Share of each interaction type that falls in each hour
    interaction_timing = interaction_df.withColumn("interaction_hour", hour(col("interaction_time"))).groupBy("interaction_hour", "interaction_type").agg(
        count("interaction_id").alias("interaction_count")
    ).withColumn("peak_interaction_ratio", col("interaction_count") / sum("interaction_count").over(Window.partitionBy("interaction_type")))
    # Pair every publish hour with every weekday metric and score each combination
    optimal_publish_times = hourly_publish_pattern.crossJoin(
        weekly_pattern.select("day_of_week", "weekly_avg_views")).withColumn("combined_score",
        col("performance_index") * 0.7 + col("weekly_avg_views") * 0.3)
    return hourly_publish_pattern.toPandas(), seasonal_analysis.toPandas(), optimal_publish_times.toPandas()
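The three functions above all return pandas DataFrames, so a small driver script can run the analyses and persist the results for the web layer to query. The sketch below is illustrative only: the HDFS paths, MySQL connection string, and result table names are assumed placeholders, not the project's actual entry point.

from sqlalchemy import create_engine

if __name__ == "__main__":
    # Hypothetical HDFS locations of the collected JSON data
    video_path = "hdfs:///bilibili/videos"
    creator_path = "hdfs:///bilibili/creators"
    interaction_path = "hdfs:///bilibili/interactions"
    user_path = "hdfs:///bilibili/users"
    # Placeholder MySQL connection for the visualization backend
    engine = create_engine("mysql+pymysql://root:123456@localhost:3306/bilibili_analysis")
    ranking, trend, categories = creator_analysis(video_path, creator_path)
    activity, engagement, loyalty = user_behavior_analysis(interaction_path, user_path)
    hourly, seasonal, optimal = time_pattern_analysis(video_path, interaction_path)
    # Persist each result table; the Echarts charts are rendered from these tables
    results = [("creator_ranking", ranking), ("creator_trend", trend),
               ("creator_category", categories), ("user_activity", activity),
               ("user_engagement", engagement), ("user_loyalty", loyalty),
               ("hourly_pattern", hourly), ("seasonal_analysis", seasonal),
               ("optimal_publish_times", optimal)]
    for name, df in results:
        df.to_sql(name, engine, if_exists="replace", index=False)
    spark.stop()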
VI. System Documentation
Conclusion
💕💕To get the source code, contact 计算机程序员小杨