【Big Data】Bilibili Popular Video Comment Sentiment Visualization and Analysis System | Computer Science Project | Hadoop + Spark Environment Setup | Data Science and Big Data Technology | Source Code + Documentation + Walkthrough Included


一、About the Author

💖💖 Author: 计算机编程果茶熊 💙💙 About me: I spent years in computer-science training and teaching as a programming instructor, and I still enjoy teaching. I am proficient in Java, WeChat Mini Programs, Python, Golang, Android, and several other IT directions. I take on custom project development, code walkthroughs, thesis-defense coaching, and documentation writing, and I know a few techniques for lowering similarity-check scores. I like sharing solutions to problems I run into during development and exchanging ideas about technology, so feel free to ask me anything code-related! 💛💛 A word of thanks: thank you all for your attention and support! 💜💜 Website practical projects | Android/Mini Program practical projects | Big data practical projects | Graduation project topic selection 💕💕 To get the source code, contact 计算机编程果茶熊 at the end of this article

二、System Overview

Big data framework: Hadoop + Spark (Hive requires custom modification) | Development languages: Java + Python (both versions are supported) | Database: MySQL | Backend frameworks: Spring Boot (Spring + Spring MVC + MyBatis) and Django (both versions are supported) | Frontend: Vue + Echarts + HTML + CSS + JavaScript + jQuery
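
As a rough illustration of how these layers connect, here is a minimal sketch of a Spark job that aggregates the comments table (assumed to be registered in Hive on top of Hadoop/HDFS) and writes the result into MySQL over JDBC, where the Spring Boot or Django backend can read it. The JDBC URL, credentials, database name bilibili_analysis, and table name comment_stats are placeholders, not the project's actual configuration.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("CommentStatsToMySQL").enableHiveSupport().getOrCreate()

# Aggregate per-video comment counts from the (assumed) Hive table `comments`.
stats_df = spark.sql("SELECT video_id, COUNT(*) AS comment_count FROM comments GROUP BY video_id")

# Write the aggregate into MySQL so the backend can serve it directly.
(stats_df.write.format("jdbc")
    .option("url", "jdbc:mysql://localhost:3306/bilibili_analysis?useSSL=false&characterEncoding=utf8")
    .option("dbtable", "comment_stats")
    .option("user", "root")
    .option("password", "<password>")
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .mode("overwrite")
    .save())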

The Bilibili Popular Video Comment Sentiment Visualization and Analysis System is an intelligent analysis platform built on big data technology, dedicated to deep mining and sentiment analysis of user comments on popular Bilibili videos. The system uses the Hadoop distributed storage architecture and the Spark processing engine as its underlying foundation, implements data processing and algorithms in Python, exposes stable service interfaces through the Django backend framework, and builds the user-facing interface with Vue, the ElementUI component library, and the Echarts visualization toolkit. Its core features cover video interaction analysis, comment sentiment analysis, comment hotspot analysis, and comment time-distribution analysis. Spark SQL handles efficient query processing, while Pandas and NumPy handle data cleaning and statistical analysis, turning complex comment data into intuitive visual charts. The system automatically crawls comment data from popular Bilibili videos, applies natural language processing techniques to identify the sentiment polarity of each comment, mines the hot topics users care about, and analyzes how comments are distributed over time, providing data-driven decision support for content creators, platform operators, and researchers.
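
As a sketch of the Django + Vue + Echarts wiring described above, the backend can expose the Spark analysis result as a JSON endpoint for the frontend charts. The module path, view name, and URL pattern below are illustrative assumptions rather than the project's actual API; the analyze_comment_sentiment function it calls is the one shown in the code excerpts in section 五.

# views.py -- hypothetical Django endpoint feeding an Echarts chart on the Vue side
from django.http import JsonResponse

from analysis.spark_jobs import analyze_comment_sentiment  # hypothetical module path

def comment_sentiment_api(request, video_id):
    # Run the Spark-based sentiment analysis and forward its dict result as JSON.
    result = analyze_comment_sentiment(video_id)
    return JsonResponse(result, json_dumps_params={"ensure_ascii": False})

# urls.py -- assumed route
# path("api/sentiment/<str:video_id>/", comment_sentiment_api)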

三、Video Walkthrough

Bilibili Popular Video Comment Sentiment Visualization and Analysis System

四、Feature Screenshots

(Screenshots of the system's feature pages)

五、Code Excerpts


from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, when, regexp_replace, split, explode, desc, asc, length, hour, collect_list
from pyspark.sql.types import StringType, IntegerType, FloatType, StructType, StructField
import pandas as pd
import numpy as np
import re
import jieba
from datetime import datetime, timedelta
from collections import Counter
import json

spark = (SparkSession.builder
         .appName("BilibiliCommentAnalysis")
         .config("spark.sql.adaptive.enabled", "true")
         .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
         .getOrCreate())

def analyze_video_interaction_features(video_id):
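    # Interaction-feature profile for one video: comment volume, average length,
    # reply rate, like quartiles, posting-interval and peak-hour statistics,
    # repeat-commenter rate, word diversity, and a combined interaction score.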
    comments_df = spark.sql(f"SELECT * FROM comments WHERE video_id = '{video_id}'")
    total_comments = comments_df.count()
    avg_comment_length = comments_df.select(avg(length(col("content")))).collect()[0][0] or 0  # length() is the pyspark.sql.functions helper; fall back to 0 when there are no comments
    reply_rate = comments_df.filter(col("reply_count") > 0).count() / total_comments if total_comments > 0 else 0
    like_distribution = comments_df.select("like_count").rdd.map(lambda x: x[0]).collect()
    like_quartiles = np.percentile(like_distribution, [25, 50, 75]) if like_distribution else np.array([0.0, 0.0, 0.0])  # keep an ndarray so .tolist() below always works
    interaction_score = float((reply_rate * 0.4 + (avg_comment_length / 100) * 0.3 + (like_quartiles[1] / 1000) * 0.3) * 100)  # cast to a plain float so the result dict stays JSON-serializable
    time_intervals = sorted(comments_df.select("comment_time").rdd.map(lambda x: x[0]).collect())  # sort so consecutive differences are true inter-comment gaps
    if len(time_intervals) > 1:
        time_diffs = [abs((time_intervals[i] - time_intervals[i-1]).total_seconds()) for i in range(1, len(time_intervals))]
        avg_interval = sum(time_diffs) / len(time_diffs) / 3600
    else:
        avg_interval = 0
    engagement_periods = comments_df.withColumn("hour", hour(col("comment_time"))).groupBy("hour").count().orderBy(desc("count"))  # hour() from pyspark.sql.functions; Column objects have no .hour() method
    peak_hours = engagement_periods.limit(3).select("hour").rdd.map(lambda x: x[0]).collect()
    user_participation = comments_df.groupBy("user_id").count().filter(col("count") > 1).count()
    repeat_user_rate = user_participation / comments_df.select("user_id").distinct().count() if comments_df.select("user_id").distinct().count() > 0 else 0
    content_diversity = comments_df.select("content").rdd.map(lambda x: len(set(jieba.cut(x[0])))).collect()
    avg_word_diversity = sum(content_diversity) / len(content_diversity) if content_diversity else 0
    quality_comments = comments_df.filter((length(col("content")) > 10) & (col("like_count") > 5)).count()
    quality_rate = quality_comments / total_comments if total_comments > 0 else 0
    result = {
        "video_id": video_id,
        "total_comments": total_comments,
        "avg_comment_length": round(avg_comment_length, 2),
        "reply_rate": round(reply_rate, 4),
        "like_quartiles": like_quartiles.tolist(),
        "interaction_score": round(interaction_score, 2),
        "avg_comment_interval_hours": round(avg_interval, 2),
        "peak_engagement_hours": peak_hours,
        "repeat_user_rate": round(repeat_user_rate, 4),
        "avg_word_diversity": round(avg_word_diversity, 2),
        "quality_comment_rate": round(quality_rate, 4)
    }
    return result

def analyze_comment_sentiment(video_id):
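    # Lexicon-based sentiment analysis: each comment is scored against small
    # positive/negative/neutral word lists, weighted by likes and replies, then
    # aggregated into a distribution, a reliability rate, and a daily trend.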
    comments_df = spark.sql(f"SELECT comment_id, content, like_count, reply_count FROM comments WHERE video_id = '{video_id}'")
    positive_words = ["好", "棒", "赞", "喜欢", "优秀", "精彩", "完美", "不错", "厉害", "牛", "强", "爱了", "太好了", "满意", "推荐"]
    negative_words = ["差", "烂", "讨厌", "垃圾", "无聊", "失望", "糟糕", "不行", "难看", "恶心", "坑", "骗", "假", "坏", "恶劣"]
    neutral_words = ["还行", "一般", "普通", "凑合", "平常", "正常", "可以", "尚可"]
    def calculate_sentiment_score(content):
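        # Score = (positive_count - negative_count) / total sentiment words,
        # thresholded at ±0.1 to assign a positive / negative / neutral label.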
        words = list(jieba.cut(content))
        positive_count = sum(1 for word in words if word in positive_words)
        negative_count = sum(1 for word in words if word in negative_words)
        neutral_count = sum(1 for word in words if word in neutral_words)
        total_sentiment_words = positive_count + negative_count + neutral_count
        if total_sentiment_words == 0:
            return 0.0, "neutral"
        sentiment_score = (positive_count - negative_count) / total_sentiment_words
        if sentiment_score > 0.1:
            return sentiment_score, "positive"
        elif sentiment_score < -0.1:
            return sentiment_score, "negative"
        else:
            return sentiment_score, "neutral"
    sentiment_results = []
    for row in comments_df.collect():
        comment_id, content, like_count, reply_count = row
        score, label = calculate_sentiment_score(content)
        weight = 1 + (like_count * 0.1) + (reply_count * 0.05)
        weighted_score = score * weight
        sentiment_results.append({
            "comment_id": comment_id,
            "sentiment_score": round(score, 4),
            "weighted_score": round(weighted_score, 4),
            "sentiment_label": label,
            "confidence": abs(score)
        })
    sentiment_df = pd.DataFrame(sentiment_results, columns=["comment_id", "sentiment_score", "weighted_score", "sentiment_label", "confidence"])  # declare columns so the frame is well-formed even when there are no comments
    positive_count = len(sentiment_df[sentiment_df['sentiment_label'] == 'positive'])
    negative_count = len(sentiment_df[sentiment_df['sentiment_label'] == 'negative'])
    neutral_count = len(sentiment_df[sentiment_df['sentiment_label'] == 'neutral'])
    total_count = len(sentiment_df)
    overall_sentiment = sentiment_df['weighted_score'].mean() if total_count > 0 else 0
    sentiment_distribution = {
        "positive_rate": round(positive_count / total_count, 4) if total_count > 0 else 0,
        "negative_rate": round(negative_count / total_count, 4) if total_count > 0 else 0,
        "neutral_rate": round(neutral_count / total_count, 4) if total_count > 0 else 0
    }
    high_confidence_sentiments = sentiment_df[sentiment_df['confidence'] > 0.3]
    reliable_sentiment_rate = len(high_confidence_sentiments) / total_count if total_count > 0 else 0
    time_based_sentiment = spark.sql(f"SELECT DATE(comment_time) as date, content FROM comments WHERE video_id = '{video_id}' ORDER BY comment_time")
    daily_sentiment_trends = []
    for date_row in time_based_sentiment.select("date").distinct().collect():
        date_comments = time_based_sentiment.filter(col("date") == date_row[0])
        date_sentiments = [calculate_sentiment_score(row[0])[0] for row in date_comments.select("content").collect()]  # after select("content") each row has a single column
        avg_daily_sentiment = sum(date_sentiments) / len(date_sentiments) if date_sentiments else 0
        daily_sentiment_trends.append({"date": str(date_row[0]), "avg_sentiment": round(avg_daily_sentiment, 4)})
    return {
        "video_id": video_id,
        "overall_sentiment_score": round(overall_sentiment, 4),
        "sentiment_distribution": sentiment_distribution,
        "total_analyzed_comments": total_count,
        "reliable_sentiment_rate": round(reliable_sentiment_rate, 4),
        "daily_sentiment_trends": daily_sentiment_trends,
        "detailed_sentiments": sentiment_results
    }

def analyze_comment_hotspots(video_id):
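    # Hotspot mining: jieba segmentation with stop-word filtering, top keywords
    # and their engagement, hourly hot words, high-engagement trending topics,
    # and co-occurring words per keyword as simple topic clusters.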
    comments_df = spark.sql(f"SELECT content, like_count, reply_count, comment_time FROM comments WHERE video_id = '{video_id}'")
    stop_words = ["的", "了", "在", "是", "我", "有", "和", "就", "不", "人", "都", "一", "个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好", "自己", "这个"]
    all_words = []
    comment_contents = comments_df.select("content").rdd.map(lambda x: x[0]).collect()
    for content in comment_contents:
        words = jieba.cut(content)
        filtered_words = [word for word in words if len(word) > 1 and word not in stop_words and re.match(r'^[\u4e00-\u9fa5a-zA-Z]+$', word)]  # keep only pure Chinese/English tokens
        all_words.extend(filtered_words)
    word_frequency = Counter(all_words)
    top_keywords = word_frequency.most_common(20)
    keyword_trends = {}
    for keyword, freq in top_keywords:
        keyword_comments = comments_df.filter(col("content").contains(keyword))
        avg_likes = keyword_comments.select(avg("like_count")).collect()[0][0] or 0
        avg_replies = keyword_comments.select(avg("reply_count")).collect()[0][0] or 0
        keyword_trends[keyword] = {
            "frequency": freq,
            "avg_likes": round(avg_likes, 2),
            "avg_replies": round(avg_replies, 2),
            "popularity_score": round(freq * 0.5 + avg_likes * 0.3 + avg_replies * 0.2, 2)
        }
    time_based_topics = comments_df.withColumn("hour", hour(col("comment_time"))).groupBy("hour").agg(collect_list("content").alias("contents"))
    hourly_hotspots = {}
    for hour_row in time_based_topics.collect():
        hour, contents = hour_row
        hour_words = []
        for content in contents:
            words = jieba.cut(content)
            filtered_words = [word for word in words if len(word) > 1 and word not in stop_words]
            hour_words.extend(filtered_words)
        hour_word_freq = Counter(hour_words)
        hourly_hotspots[hour] = hour_word_freq.most_common(5)
    high_engagement_comments = comments_df.filter((col("like_count") > 10) | (col("reply_count") > 5))
    trending_topics = []
    for content_row in high_engagement_comments.select("content", "like_count", "reply_count").collect():
        content, likes, replies = content_row
        words = jieba.cut(content)
        for word in words:
            if len(word) > 1 and word not in stop_words:
                engagement_score = likes * 2 + replies * 3
                trending_topics.append({"topic": word, "engagement": engagement_score, "content": content[:50]})
    trending_df = pd.DataFrame(trending_topics)
    if not trending_df.empty:
        topic_engagement = trending_df.groupby('topic')['engagement'].sum().sort_values(ascending=False).head(10)
        top_trending = [{"topic": topic, "total_engagement": int(engagement)} for topic, engagement in topic_engagement.items()]
    else:
        top_trending = []
    topic_clusters = {}
    for keyword, data in keyword_trends.items():
        related_comments = comments_df.filter(col("content").contains(keyword)).select("content").rdd.map(lambda x: x[0]).collect()
        related_words = []
        for comment in related_comments:
            words = jieba.cut(comment)
            related_words.extend([w for w in words if w != keyword and len(w) > 1 and w not in stop_words])
        related_freq = Counter(related_words)
        topic_clusters[keyword] = related_freq.most_common(5)
    return {
        "video_id": video_id,
        "top_keywords": [{"word": word, "frequency": freq} for word, freq in top_keywords],
        "keyword_trends": keyword_trends,
        "hourly_hotspots": {str(hour): topics for hour, topics in hourly_hotspots.items()},
        "trending_topics": top_trending,
        "topic_clusters": topic_clusters,
        "total_unique_words": len(word_frequency),
        "word_diversity_score": round(len(word_frequency) / len(all_words), 4) if all_words else 0
    }
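
# For reference, a minimal driver that exercises the three analysis functions
# could look like the sketch below. The video id is a placeholder and the output
# file is an assumption; in the actual system these results are returned through
# the backend API for the Vue + Echarts frontend rather than written to disk.
if __name__ == "__main__":
    demo_video_id = "BV1xx411c7mD"  # placeholder; use a real video_id from the comments table
    report = {
        "interaction": analyze_video_interaction_features(demo_video_id),
        "sentiment": analyze_comment_sentiment(demo_video_id),
        "hotspots": analyze_comment_hotspots(demo_video_id),
    }
    with open("analysis_report.json", "w", encoding="utf-8") as f:
        json.dump(report, f, ensure_ascii=False, indent=2)  # json is imported at the top of this file
    spark.stop()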

六、Documentation Excerpts

(Sample page from the project documentation)

七、END

💕💕 To get the source code, contact 计算机编程果茶熊