💖💖作者:计算机毕业设计江挽 💙💙个人简介:曾长期从事计算机专业培训教学,本人也热爱上课教学,语言擅长Java、微信小程序、Python、Golang、安卓Android等,开发项目包括大数据、深度学习、网站、小程序、安卓、算法。平常会做一些项目定制化开发、代码讲解、答辩教学、文档编写、也懂一些降重方面的技巧。平常喜欢分享一些自己开发中遇到的问题的解决办法,也喜欢交流技术,大家有技术代码这一块的问题可以问我! 💛💛想说的话:感谢大家的关注与支持! 💜💜 网站实战项目 安卓/小程序实战项目 大数据实战项目 深度学习实战项目
基于大数据的B站热门视频评论情感可视化分析系统介绍
基于大数据的B站热门视频评论情感可视化分析系统是一个集数据采集、处理、分析与可视化展示于一体的综合性平台。系统采用Hadoop+Spark大数据技术架构,能够高效处理海量B站视频评论数据,通过Django后端框架提供稳定的数据服务,前端使用Vue+ElementUI+Echarts技术栈实现友好的交互界面和丰富的图表展示效果。系统核心功能包括视频互动特征分析、评论情感倾向分析、用户评论热点分析和评论时间分布分析等模块,能够从多个维度深入挖掘B站用户评论数据的价值。平台通过大屏可视化模块将分析结果以直观的图表形式呈现,帮助用户快速了解视频内容的受众反馈情况。系统还提供用户管理、个人中心、通知公告等基础功能,确保平台的完整性和实用性。整个系统基于MySQL数据库进行数据存储,利用Spark SQL进行高效的数据查询和分析,结合Pandas、NumPy等数据处理库实现复杂的情感分析算法,为用户提供准确可靠的数据分析结果。
基于大数据的B站热门视频评论情感可视化分析系统演示视频
基于大数据的B站热门视频评论情感可视化分析系统演示图片
基于大数据的B站热门视频评论情感可视化分析系统代码展示
from collections import Counter

import jieba
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    avg,
    col,
    count,
    explode,
    length,
    regexp_extract,
    split,
    udf,
    when,
)
from pyspark.sql.types import ArrayType, DoubleType, StringType
from textblob import TextBlob
spark = SparkSession.builder.appName("BilibiliCommentAnalysis").config("spark.sql.adaptive.enabled", "true").config("spark.sql.adaptive.coalescePartitions.enabled", "true").getOrCreate()
def analyze_comment_sentiment(video_id):
    """Compute the sentiment distribution and hourly sentiment counts for one video.

    Args:
        video_id: B站 video identifier used to filter the ``comments`` table.

    Returns:
        dict with two keys:
            "sentiment_distribution": {label: {"count", "percentage",
                "avg_likes", "avg_replies"}} for labels positive/negative/neutral.
            "hourly_sentiment": list of Rows (hour, sentiment_label, count).
    """
    # Filter with the DataFrame API instead of an f-string SQL literal so a
    # malicious or odd video_id cannot inject SQL.
    comments_df = (
        spark.table("comments")
        .filter(col("video_id") == video_id)
        .select("comment_id", "user_id", "content", "like_count", "reply_count", "publish_time")
    )

    def get_sentiment_score(text):
        # TextBlob polarity lies in [-1.0, 1.0]; empty/unparseable text scores 0.0.
        # NOTE(review): TextBlob's default analyzer targets English — its accuracy
        # on Chinese B站 comments is questionable; confirm this is intended.
        if not text or not text.strip():
            return 0.0
        try:
            return float(TextBlob(text).sentiment.polarity)
        except Exception:
            return 0.0

    # Declare DoubleType explicitly: registering without a return type defaults
    # to StringType, which made the > 0.1 / < -0.1 comparisons lexicographic.
    sentiment_udf = udf(get_sentiment_score, DoubleType())
    sentiment_df = comments_df.withColumn("sentiment_score", sentiment_udf(col("content")))
    sentiment_df = sentiment_df.withColumn(
        "sentiment_label",
        when(col("sentiment_score") > 0.1, "positive")
        .when(col("sentiment_score") < -0.1, "negative")
        .otherwise("neutral"),
    )
    sentiment_stats = sentiment_df.groupBy("sentiment_label").agg(
        count("*").alias("count"),
        avg("like_count").alias("avg_likes"),
        avg("reply_count").alias("avg_replies"),
    ).collect()
    total_comments = sentiment_df.count()
    result = {}
    for row in sentiment_stats:
        label = row["sentiment_label"]
        n_comments = row["count"]
        # Guard against an empty comment set to avoid ZeroDivisionError.
        percentage = (n_comments / total_comments) * 100 if total_comments > 0 else 0
        result[label] = {
            "count": n_comments,
            "percentage": round(percentage, 2),
            "avg_likes": round(row["avg_likes"], 2),
            "avg_replies": round(row["avg_replies"], 2),
        }
    # Assumes publish_time contains an "HH:" hour component — TODO confirm the
    # stored timestamp format against the crawler.
    hourly_sentiment = sentiment_df.withColumn(
        "hour", regexp_extract(col("publish_time"), r"(\d{2}):", 1)
    )
    hourly_stats = hourly_sentiment.groupBy("hour", "sentiment_label").count().collect()
    return {"sentiment_distribution": result, "hourly_sentiment": hourly_stats}
def analyze_video_interaction_features(video_id):
    """Summarize interaction features for one video's comment section.

    Covers user engagement rate, like/reply summary statistics, interaction
    level buckets, the five peak commenting hours, and average comment length.

    Args:
        video_id: B站 video identifier used to filter the ``comments`` table.

    Returns:
        dict with keys "user_engagement_rate", "like_stats", "reply_stats",
        "interaction_levels", "peak_hours", "avg_comment_length".
    """
    # DataFrame-API filter instead of f-string SQL: immune to SQL injection.
    comments_df = (
        spark.table("comments")
        .filter(col("video_id") == video_id)
        .select("user_id", "content", "like_count", "reply_count", "publish_time")
    )
    user_stats = comments_df.groupBy("user_id").agg(
        count("*").alias("comment_count"),
        avg("like_count").alias("avg_likes"),
        avg("reply_count").alias("avg_replies"),
    )
    # "Active" users: authored at least 3 comments on this video.
    active_users = user_stats.filter(col("comment_count") >= 3).count()
    total_users = user_stats.count()
    user_engagement_rate = (active_users / total_users) * 100 if total_users > 0 else 0
    like_distribution = comments_df.select("like_count").describe().collect()
    reply_distribution = comments_df.select("reply_count").describe().collect()
    interaction_levels = comments_df.withColumn(
        "interaction_level",
        when(col("like_count") >= 100, "high")
        .when(col("like_count") >= 10, "medium")
        .otherwise("low"),
    )
    level_stats = interaction_levels.groupBy("interaction_level").count().collect()
    # Assumes publish_time contains an "HH:" hour component — TODO confirm format.
    time_pattern = comments_df.withColumn(
        "hour", regexp_extract(col("publish_time"), r"(\d{2}):", 1)
    )
    peak_hours = (
        time_pattern.groupBy("hour").count().orderBy(col("count").desc()).limit(5).collect()
    )
    # Per-row comment length. The original version collected one unrelated
    # scalar via a nested spark.sql(...).collect() at plan-build time and
    # stamped it on every row; length() computes each comment's actual length.
    comment_length = comments_df.withColumn(
        "content_length",
        when(col("content").isNotNull(), length(col("content"))).otherwise(0),
    )
    avg_comment_length = comment_length.agg(avg("content_length")).collect()[0][0]
    return {
        "user_engagement_rate": round(user_engagement_rate, 2),
        "like_stats": {row["summary"]: row["like_count"] for row in like_distribution},
        "reply_stats": {row["summary"]: row["reply_count"] for row in reply_distribution},
        "interaction_levels": {row["interaction_level"]: row["count"] for row in level_stats},
        "peak_hours": [{"hour": row["hour"], "count": row["count"]} for row in peak_hours],
        # agg() returns None on an empty DataFrame — report 0 in that case.
        "avg_comment_length": round(avg_comment_length, 2) if avg_comment_length else 0,
    }
def analyze_comment_hotspots(video_id):
    """Mine hotspot signals from one video's comments.

    Produces the top keywords by frequency, sample high-interaction comments,
    comment activity by time-of-day period, and trending topic words from the
    most-liked comments.

    Args:
        video_id: B站 video identifier used to filter the ``comments`` table.

    Returns:
        dict with keys "hot_keywords", "high_interaction_comments",
        "time_period_activity", "trending_topics".
    """
    # DataFrame-API filter instead of f-string SQL: immune to SQL injection.
    comments_df = (
        spark.table("comments")
        .filter((col("video_id") == video_id) & col("content").isNotNull())
        .select("content", "like_count", "reply_count", "publish_time")
    )

    def extract_keywords(text):
        # Up to 10 jieba tokens per comment, keeping only alphanumeric words
        # longer than one character (drops punctuation and single chars).
        if not text or not text.strip():
            return []
        try:
            words = jieba.lcut(text)
            return [w for w in words if len(w) > 1 and w.isalnum()][:10]
        except Exception:
            return []

    # Declare ArrayType(StringType()): the UDF returns a Python list, and the
    # original registration defaulted to StringType, which breaks explode().
    keywords_udf = udf(extract_keywords, ArrayType(StringType()))
    keywords_df = comments_df.withColumn("keywords", keywords_udf(col("content")))
    keywords_exploded = keywords_df.select(
        explode(col("keywords")).alias("keyword"), col("like_count"), col("reply_count")
    )
    keyword_stats = (
        keywords_exploded.groupBy("keyword")
        .agg(
            count("*").alias("frequency"),
            avg("like_count").alias("avg_likes"),
            avg("reply_count").alias("avg_replies"),
        )
        .filter(col("frequency") >= 5)  # drop rare noise words
        .orderBy(col("frequency").desc())
        .limit(20)
    )
    hot_keywords = keyword_stats.collect()
    high_interaction_comments = (
        comments_df.filter(col("like_count") >= 50)
        .select("content", "like_count", "reply_count")
        .limit(10)
        .collect()
    )
    # Bucket comments into four day periods by the hour of publish_time.
    hour_col = regexp_extract(col("publish_time"), r"(\d{2}):", 1).cast("int")
    time_hotspots = comments_df.withColumn(
        "time_period",
        when(hour_col.between(0, 6), "dawn")
        .when(hour_col.between(7, 12), "morning")
        .when(hour_col.between(13, 18), "afternoon")
        .otherwise("evening"),
    )
    period_activity = time_hotspots.groupBy("time_period").agg(
        count("*").alias("comment_count"),
        avg("like_count").alias("avg_likes"),
    ).collect()
    # Trending topics: segment the text of up to 50 well-liked comments and
    # count the most common multi-character words on the driver.
    topic_clusters = comments_df.filter(col("like_count") >= 20)
    cluster_keywords = topic_clusters.select("content").limit(50).collect()
    all_cluster_text = " ".join(row["content"] for row in cluster_keywords if row["content"])
    if all_cluster_text:
        topic_words = [w for w in jieba.lcut(all_cluster_text) if len(w) > 1]
        trending_topics = Counter(topic_words).most_common(15)
    else:
        trending_topics = []
    return {
        "hot_keywords": [
            {"keyword": row["keyword"], "frequency": row["frequency"],
             "avg_likes": round(row["avg_likes"], 2)}
            for row in hot_keywords
        ],
        "high_interaction_comments": [
            {"content": row["content"][:100], "likes": row["like_count"],
             "replies": row["reply_count"]}
            for row in high_interaction_comments
        ],
        "time_period_activity": [
            {"period": row["time_period"], "count": row["comment_count"],
             "avg_likes": round(row["avg_likes"], 2)}
            for row in period_activity
        ],
        # Renamed loop variable: the original `count` shadowed the imported
        # pyspark.sql.functions.count inside this comprehension.
        "trending_topics": [{"topic": word, "count": freq} for word, freq in trending_topics],
    }
基于大数据的B站热门视频评论情感可视化分析系统文档展示
💖💖作者:计算机毕业设计江挽 💙💙个人简介:曾长期从事计算机专业培训教学,本人也热爱上课教学,语言擅长Java、微信小程序、Python、Golang、安卓Android等,开发项目包括大数据、深度学习、网站、小程序、安卓、算法。平常会做一些项目定制化开发、代码讲解、答辩教学、文档编写、也懂一些降重方面的技巧。平常喜欢分享一些自己开发中遇到的问题的解决办法,也喜欢交流技术,大家有技术代码这一块的问题可以问我! 💛💛想说的话:感谢大家的关注与支持! 💜💜 网站实战项目 安卓/小程序实战项目 大数据实战项目 深度学习实战项目