💖💖作者:计算机毕业设计杰瑞 💙💙个人简介:曾长期从事计算机专业培训教学,本人也热爱上课教学,语言擅长Java、微信小程序、Python、Golang、安卓Android等,开发项目包括大数据、深度学习、网站、小程序、安卓、算法。平常会做一些项目定制化开发、代码讲解、答辩教学、文档编写、也懂一些降重方面的技巧。平常喜欢分享一些自己开发中遇到的问题的解决办法,也喜欢交流技术,大家有技术代码这一块的问题可以问我! 💛💛想说的话:感谢大家的关注与支持! 💜💜 网站实战项目 安卓/小程序实战项目 大数据实战项目 深度学习实战项目 计算机毕业设计选题推荐
基于大数据的社交媒体舆情数据可视化分析系统介绍
《基于大数据的社交媒体舆情数据可视化分析系统》是一套完整的舆情监测与分析解决方案,采用Hadoop分布式文件系统作为底层存储架构,配合Spark计算引擎实现海量社交媒体数据的高效处理。系统提供Python+Django和Java+SpringBoot两种技术实现版本,前端采用Vue+ElementUI构建交互界面,通过Echarts图表库将分析结果以直观的可视化形式展现。核心功能模块包括系统首页的数据概览、个人信息管理、系统配置管理以及多维度数据分析功能。系统运用Spark SQL进行结构化数据查询,结合Pandas和NumPy完成数据清洗与统计计算,能够对社交媒体平台产生的文本、用户行为等数据进行情感分析、热点话题挖掘和传播趋势预测。数据存储层面通过HDFS实现分布式存储,保证数据的高可靠性和可扩展性,MySQL数据库负责存储用户信息和系统配置等结构化数据。整个系统架构清晰,技术栈完整,既能满足毕业设计对大数据技术应用的要求,又具备实际的舆情监测价值,适合作为计算机专业学生深入学习大数据处理流程的实践项目。
基于大数据的社交媒体舆情数据可视化分析系统演示视频
基于大数据的社交媒体舆情数据可视化分析系统演示图片
基于大数据的社交媒体舆情数据可视化分析系统代码展示
import re
from collections import Counter
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    avg,
    col,
    count,
    dayofmonth,
    desc,
    hour,
    length,
    lower,
    month,
    regexp_replace,
    row_number,
    sum as spark_sum,
    trim,
    when,
    year,
)
from pyspark.sql.window import Window
# Module-wide Spark entry point shared by every analysis routine below.
# Config: Hive-style warehouse dir, 4g per executor, 2g driver memory.
_builder = SparkSession.builder.appName("SocialMediaSentimentAnalysis")
_builder = _builder.config("spark.sql.warehouse.dir", "/user/hive/warehouse")
_builder = _builder.config("spark.executor.memory", "4g")
_builder = _builder.config("spark.driver.memory", "2g")
spark = _builder.getOrCreate()
def sentiment_analysis_pipeline(hdfs_data_path, start_date, end_date, keyword_filter=None):
    """Run the end-to-end sentiment analysis over social-media posts on HDFS.

    Args:
        hdfs_data_path: CSV path on HDFS. Expected columns (from usage below):
            content, publish_time, likes, comments, shares, user_id, user_name.
        start_date: Inclusive lower bound applied to publish_time.
        end_date: Inclusive upper bound applied to publish_time.
        keyword_filter: Optional substring; when given, only posts whose
            cleaned content contains it are analyzed.

    Returns:
        dict with daily sentiment counts, overall sentiment distribution,
        hourly activity, top-20 users by likes, top-5 hot posts per sentiment,
        a daily trend table (ratios, score, 3-day moving average) and a summary.
    """
    raw_df = spark.read.option("header", "true").option("inferSchema", "true").csv(hdfs_data_path)
    # Keep only CJK chars, latin letters, digits and whitespace, normalize
    # case, then drop empty rows and rows outside the requested date window.
    cleaned_df = (raw_df
                  .withColumn("content_clean", regexp_replace(col("content"), r'[^\u4e00-\u9fa5a-zA-Z0-9\s]', ''))
                  .withColumn("content_clean", trim(lower(col("content_clean"))))
                  .filter(col("content_clean") != "")
                  .filter((col("publish_time") >= start_date) & (col("publish_time") <= end_date)))
    if keyword_filter:
        cleaned_df = cleaned_df.filter(col("content_clean").contains(keyword_filter))
    # Naive lexicon-based polarity; a positive keyword hit wins over negative.
    sentiment_keywords_positive = ["好", "喜欢", "优秀", "满意", "推荐", "赞", "支持", "开心", "棒", "不错"]
    sentiment_keywords_negative = ["差", "垃圾", "失望", "糟糕", "反对", "烂", "讨厌", "愤怒", "坑", "后悔"]
    sentiment_expr = (when(col("content_clean").rlike("|".join(sentiment_keywords_positive)), "positive")
                      .when(col("content_clean").rlike("|".join(sentiment_keywords_negative)), "negative")
                      .otherwise("neutral"))
    # BUG FIX: pyspark Column has no .length() method (attribute access on a
    # Column is struct-field access, so the original
    # col(...).cast("string").length() raises "Column is not callable").
    # Use the length() SQL function instead.
    sentiment_df = (cleaned_df
                    .withColumn("sentiment", sentiment_expr)
                    .withColumn("content_length", length(col("content_clean"))))
    # Cache: sentiment_df feeds six independent aggregations below.
    sentiment_df = sentiment_df.cache()
    daily_sentiment = (sentiment_df
                       .withColumn("date", col("publish_time").cast("date"))
                       .groupBy("date", "sentiment")
                       .agg(count("*").alias("count"),
                            avg("likes").alias("avg_likes"),
                            avg("comments").alias("avg_comments"),
                            avg("shares").alias("avg_shares")))
    sentiment_distribution = (sentiment_df
                              .groupBy("sentiment")
                              .agg(count("*").alias("total_count"),
                                   avg("content_length").alias("avg_length"))
                              .orderBy(desc("total_count")))
    hourly_activity = (sentiment_df
                       .withColumn("hour", hour(col("publish_time")))
                       .groupBy("hour")
                       .agg(count("*").alias("post_count"), avg("likes").alias("avg_likes"))
                       .orderBy("hour"))
    top_users = (sentiment_df
                 .groupBy("user_id", "user_name")
                 .agg(count("*").alias("post_count"),
                      spark_sum("likes").alias("total_likes"),
                      spark_sum("comments").alias("total_comments"))
                 .orderBy(desc("total_likes"))
                 .limit(20))
    # Top 5 most-liked posts within each sentiment class.
    window_spec = Window.partitionBy("sentiment").orderBy(desc("likes"))
    hot_posts = (sentiment_df
                 .withColumn("rank", row_number().over(window_spec))
                 .filter(col("rank") <= 5)
                 .select("content", "sentiment", "likes", "comments", "shares", "publish_time"))
    daily_sentiment_pd = daily_sentiment.toPandas()
    sentiment_dist_pd = sentiment_distribution.toPandas()
    hourly_activity_pd = hourly_activity.toPandas()
    top_users_pd = top_users.toPandas()
    hot_posts_pd = hot_posts.toPandas()
    # Pivot to one row per date with one column per sentiment class.
    daily_pivot = daily_sentiment_pd.pivot(index='date', columns='sentiment', values='count').fillna(0)
    daily_pivot['total'] = daily_pivot.sum(axis=1)
    # .get(...) keeps this safe when a sentiment class is entirely absent.
    daily_pivot['positive_ratio'] = (daily_pivot.get('positive', 0) / daily_pivot['total'] * 100).round(2)
    daily_pivot['negative_ratio'] = (daily_pivot.get('negative', 0) / daily_pivot['total'] * 100).round(2)
    trend_analysis = daily_pivot.reset_index()
    trend_analysis['date'] = pd.to_datetime(trend_analysis['date'])
    trend_analysis = trend_analysis.sort_values('date')
    # Score in [-1, 1]: (positive - negative) / total, smoothed over 3 days.
    trend_analysis['sentiment_score'] = (trend_analysis.get('positive', 0) - trend_analysis.get('negative', 0)) / trend_analysis['total']
    trend_analysis['moving_avg_score'] = trend_analysis['sentiment_score'].rolling(window=3, min_periods=1).mean()
    # Derive the per-class counts from the already-collected distribution
    # instead of launching four extra Spark count() jobs (same values).
    class_counts = {row['sentiment']: int(row['total_count'])
                    for _, row in sentiment_dist_pd.iterrows()}
    result = {
        "daily_sentiment": daily_sentiment_pd.to_dict('records'),
        "sentiment_distribution": sentiment_dist_pd.to_dict('records'),
        "hourly_activity": hourly_activity_pd.to_dict('records'),
        "top_users": top_users_pd.to_dict('records'),
        "hot_posts": hot_posts_pd.to_dict('records'),
        "trend_analysis": trend_analysis.to_dict('records'),
        "summary": {
            "total_posts": sum(class_counts.values()),
            "positive_count": class_counts.get("positive", 0),
            "negative_count": class_counts.get("negative", 0),
            "neutral_count": class_counts.get("neutral", 0),
            "avg_engagement": float(sentiment_df.agg(avg(col("likes") + col("comments") + col("shares"))).collect()[0][0]),
        },
    }
    return result
def keyword_extraction_and_cooccurrence(hdfs_data_path, time_window_days=7, top_n=30):
    """Extract top keywords from recent posts and build a co-occurrence network.

    Args:
        hdfs_data_path: CSV path on HDFS with at least content, publish_time,
            likes and comments columns.
        time_window_days: Only posts from the last N days are analyzed.
        top_n: Number of most frequent keywords to keep.

    Returns:
        dict with the top keywords, per-keyword engagement performance,
        a co-occurrence edge list (top 50 pairs among the top 15 keywords),
        the time window used and the number of posts analyzed.

    Note:
        Cleaned contents are collected to the driver; suitable for the
        moderate data volumes of this project, not for unbounded corpora.
    """
    raw_df = spark.read.option("header", "true").option("inferSchema", "true").csv(hdfs_data_path)
    cutoff_date = datetime.now() - timedelta(days=time_window_days)
    # Punctuation is replaced by spaces (not removed) so words stay separable.
    filtered_df = (raw_df
                   .filter(col("publish_time") >= cutoff_date)
                   .withColumn("content_clean", regexp_replace(col("content"), r'[^\u4e00-\u9fa5a-zA-Z0-9\s]', ' '))
                   .withColumn("content_clean", trim(lower(col("content_clean"))))
                   .filter(col("content_clean") != ""))
    content_list = filtered_df.select("content_clean").rdd.flatMap(lambda x: x).collect()
    # Whitespace tokenization with a minimal Chinese stopword list; tokens
    # shorter than 2 characters are dropped.
    stopwords = {"的", "了", "在", "是", "我", "有", "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好", "自己", "这"}
    all_words = []
    for content in content_list:
        words = [w.strip() for w in content.split() if len(w.strip()) >= 2 and w.strip() not in stopwords]
        all_words.extend(words)
    word_freq = Counter(all_words)
    top_keywords = word_freq.most_common(top_n)
    # NOTE: the original built an unused Spark DataFrame of (keyword, freq)
    # pairs here; it was dead code and has been removed.
    content_with_keywords = filtered_df.select("content_clean", "likes", "comments")
    keyword_performance = []
    for keyword, freq in top_keywords:
        keyword_posts = content_with_keywords.filter(col("content_clean").contains(keyword))
        # One aggregation pass per keyword (count + both averages) instead of
        # three separate Spark actions as in the original.
        stats = keyword_posts.agg(count("*").alias("n"),
                                  avg("likes").alias("avg_likes"),
                                  avg("comments").alias("avg_comments")).collect()[0]
        if stats["n"] > 0:
            avg_likes = stats["avg_likes"]
            avg_comments = stats["avg_comments"]
            keyword_performance.append({
                "keyword": keyword,
                "frequency": freq,
                "avg_likes": float(avg_likes) if avg_likes else 0,
                "avg_comments": float(avg_comments) if avg_comments else 0,
                # Comments are weighted 2x as a stronger engagement signal.
                "engagement_score": float((avg_likes or 0) + (avg_comments or 0) * 2),
            })
    keyword_performance_sorted = sorted(keyword_performance, key=lambda x: x['engagement_score'], reverse=True)
    # Symmetric co-occurrence counts among the 15 most frequent keywords.
    cooccurrence_matrix = {}
    top_keyword_list = [kw for kw, _ in top_keywords[:15]]
    for content in content_list:
        words_in_content = set(w.strip() for w in content.split() if w.strip() in top_keyword_list)
        for word1 in words_in_content:
            if word1 not in cooccurrence_matrix:
                cooccurrence_matrix[word1] = {}
            for word2 in words_in_content:
                if word1 != word2:
                    cooccurrence_matrix[word1][word2] = cooccurrence_matrix[word1].get(word2, 0) + 1
    # Deduplicate (a, b) / (b, a) into a single undirected edge.
    cooccurrence_pairs = []
    processed_pairs = set()
    for word1 in cooccurrence_matrix:
        for word2, pair_count in cooccurrence_matrix[word1].items():
            pair_key = tuple(sorted([word1, word2]))
            if pair_key not in processed_pairs:
                cooccurrence_pairs.append({"source": word1, "target": word2, "weight": pair_count})
                processed_pairs.add(pair_key)
    cooccurrence_pairs_sorted = sorted(cooccurrence_pairs, key=lambda x: x['weight'], reverse=True)[:50]
    result = {
        "top_keywords": [{"keyword": kw, "frequency": freq} for kw, freq in top_keywords],
        "keyword_performance": keyword_performance_sorted,
        "cooccurrence_network": cooccurrence_pairs_sorted,
        "time_window": time_window_days,
        "total_posts_analyzed": len(content_list),
    }
    return result
def user_behavior_clustering_analysis(hdfs_data_path, cluster_count=5):
    """Cluster users by posting/engagement behavior with K-Means.

    Args:
        hdfs_data_path: CSV path on HDFS with user_id, user_name, content,
            likes, comments and shares columns.
        cluster_count: Number of K-Means clusters.

    Returns:
        dict with per-cluster profiles (mean/std of each feature plus a
        heuristic Chinese label), the top-5 users per cluster by total likes,
        the cluster size distribution, total users and a feature legend.
    """
    raw_df = spark.read.option("header", "true").option("inferSchema", "true").csv(hdfs_data_path)
    # BUG FIX: pyspark Column has no .length() method — the original
    # avg(col("content").cast("string").length()) fails at runtime with
    # "Column is not callable"; use the length() SQL function.
    user_features = (raw_df
                     .groupBy("user_id", "user_name")
                     .agg(count("*").alias("post_count"),
                          spark_sum("likes").alias("total_likes"),
                          spark_sum("comments").alias("total_comments"),
                          spark_sum("shares").alias("total_shares"),
                          avg("likes").alias("avg_likes"),
                          avg("comments").alias("avg_comments"),
                          avg(length(col("content").cast("string"))).alias("avg_content_length")))
    user_features_filled = user_features.fillna(0)
    user_features_pd = user_features_filled.toPandas()
    feature_columns = ['post_count', 'total_likes', 'total_comments', 'total_shares', 'avg_likes', 'avg_comments', 'avg_content_length']
    X = user_features_pd[feature_columns].values
    # Local imports keep sklearn optional for callers that never cluster.
    from sklearn.preprocessing import StandardScaler
    from sklearn.cluster import KMeans
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    kmeans = KMeans(n_clusters=cluster_count, random_state=42, n_init=10)
    user_features_pd['cluster'] = kmeans.fit_predict(X_scaled)
    cluster_profiles = user_features_pd.groupby('cluster').agg({
        'post_count': ['mean', 'std'],
        'total_likes': ['mean', 'std'],
        'total_comments': ['mean', 'std'],
        'avg_likes': ['mean', 'std'],
        'avg_content_length': ['mean', 'std'],
        'user_id': 'count',
    }).round(2)
    cluster_profiles.columns = ['_'.join(c).strip() for c in cluster_profiles.columns.values]
    cluster_profiles = cluster_profiles.rename(columns={'user_id_count': 'user_count'})
    cluster_profiles_dict = cluster_profiles.reset_index().to_dict('records')
    # Label each cluster from posting-activity/likes quantiles. (The original
    # also pre-assigned labels from a fixed list, but that assignment was
    # always overwritten by this if/elif/else chain — dead code, removed.)
    for profile in cluster_profiles_dict:
        if profile['post_count_mean'] > user_features_pd['post_count'].quantile(0.75) and profile['total_likes_mean'] > user_features_pd['total_likes'].quantile(0.75):
            profile['cluster_label'] = "高活跃KOL"
        elif profile['post_count_mean'] > user_features_pd['post_count'].median():
            profile['cluster_label'] = "普通活跃用户"
        elif profile['post_count_mean'] > user_features_pd['post_count'].quantile(0.25):
            profile['cluster_label'] = "低频互动用户"
        else:
            profile['cluster_label'] = "潜水观察者"
    # ROBUSTNESS: look labels up by actual cluster id instead of positional
    # indexing — positional access raises IndexError when KMeans leaves a
    # cluster empty (groupby only produces rows for non-empty clusters).
    label_by_cluster = {int(p['cluster']): p['cluster_label'] for p in cluster_profiles_dict}
    top_users_per_cluster = []
    for cluster_id in range(cluster_count):
        cluster_users = (user_features_pd[user_features_pd['cluster'] == cluster_id]
                         .nlargest(5, 'total_likes')[['user_id', 'user_name', 'post_count', 'total_likes', 'avg_likes']]
                         .to_dict('records'))
        top_users_per_cluster.append({
            "cluster_id": int(cluster_id),
            "cluster_label": label_by_cluster.get(cluster_id, f"用户群{cluster_id}"),
            "top_users": cluster_users,
        })
    user_distribution = user_features_pd['cluster'].value_counts().sort_index().to_dict()
    result = {
        "cluster_profiles": cluster_profiles_dict,
        "top_users_per_cluster": top_users_per_cluster,
        "user_distribution": {int(k): int(v) for k, v in user_distribution.items()},
        "total_users_analyzed": int(len(user_features_pd)),
        "feature_importance": {"post_count": "发帖频率", "total_likes": "总点赞数", "total_comments": "总评论数", "avg_likes": "平均点赞数", "avg_content_length": "平均内容长度"},
    }
    return result
基于大数据的社交媒体舆情数据可视化分析系统文档展示
💖💖作者:计算机毕业设计杰瑞 💙💙个人简介:曾长期从事计算机专业培训教学,本人也热爱上课教学,语言擅长Java、微信小程序、Python、Golang、安卓Android等,开发项目包括大数据、深度学习、网站、小程序、安卓、算法。平常会做一些项目定制化开发、代码讲解、答辩教学、文档编写、也懂一些降重方面的技巧。平常喜欢分享一些自己开发中遇到的问题的解决办法,也喜欢交流技术,大家有技术代码这一块的问题可以问我! 💛💛想说的话:感谢大家的关注与支持! 💜💜 网站实战项目 安卓/小程序实战项目 大数据实战项目 深度学习实战项目 计算机毕业设计选题推荐