1. About the Author
💖💖Author: 计算机编程果茶熊 💙💙About me: I spent years teaching computer science courses and working as a programming instructor, and I still enjoy teaching. I work across several IT areas, including Java, WeChat Mini Programs, Python, Golang, and Android. I take on customized project development, code walkthroughs, thesis-defense coaching, and documentation writing, and I also know some techniques for reducing plagiarism-check similarity. I like sharing solutions to problems I run into during development and exchanging ideas about technology, so feel free to ask me anything about code! 💛💛A word of thanks: thank you all for your attention and support! 💜💜 Web application projects | Android/Mini Program projects | Big data projects | Computer science graduation project topics 💕💕See the end of this post for how to contact 计算机编程果茶熊 for the source code
2. System Overview
Big data framework: Hadoop + Spark (Hive requires custom modification)
Development languages: Java + Python (both versions are available)
Database: MySQL
Backend frameworks: SpringBoot (Spring + SpringMVC + MyBatis) and Django (both versions are available)
Frontend: Vue + Echarts + HTML + CSS + JavaScript + jQuery
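In the Python version, a common way to wire these layers together is a thin Django view that triggers a PySpark analysis function and returns its result as JSON, which the Vue + Echarts frontend then renders. The sketch below is only an illustrative assumption about that wiring, not the project's exact code: the module name sentiment_analysis, the view name, and the HDFS path are hypothetical placeholders, while sentiment_trend_analysis refers to the function shown in Section 5.

# views.py — minimal Django view sketch (illustrative assumption, not the project's exact code)
from django.http import JsonResponse
from sentiment_analysis import sentiment_trend_analysis  # hypothetical module holding the Spark code from Section 5

def sentiment_trend_view(request):
    # Read the analysis window from the query string, with placeholder defaults
    start_date = request.GET.get("start_date", "2024-01-01")
    end_date = request.GET.get("end_date", "2024-12-31")
    # The path below is a placeholder for wherever the post data actually lives
    result = sentiment_trend_analysis("hdfs:///warehouse/social_media_posts", start_date, end_date)
    # The Vue side reads result["trend_data"] to draw the daily sentiment line chart in Echarts
    return JsonResponse(result, json_dumps_params={"ensure_ascii": False})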
3. Video Walkthrough
4. Selected Features
5. Selected Code
from pyspark.sql import SparkSession
from pyspark.sql.functions import (col, count, avg, desc, when, regexp_extract,
                                   explode, length, date_sub, current_date, udf,
                                   sum as spark_sum)
from pyspark.sql.types import FloatType
from pyspark.ml.feature import Tokenizer, StopWordsRemover
import jieba
import re

# Create the Spark session with adaptive query execution enabled
spark = SparkSession.builder \
    .appName("SocialMediaSentimentAnalysis") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .getOrCreate()
def sentiment_trend_analysis(data_path, start_date, end_date):
    # Load posts and keep only those published inside the requested window
    df = spark.read.parquet(data_path)
    filtered_df = df.filter((col("publish_time") >= start_date) & (col("publish_time") <= end_date))

    def calculate_sentiment_score(text):
        # Lexicon-based scoring: count positive and negative words after jieba segmentation
        if not text:
            return 0.0
        clean_text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\s]', '', text)
        positive_words = ['好', '棒', '赞', '优秀', '满意', '开心', '喜欢', '支持', '感谢']
        negative_words = ['差', '烂', '垃圾', '失望', '愤怒', '讨厌', '反对', '批评', '投诉']
        words = jieba.lcut(clean_text)
        positive_count = sum(1 for word in words if word in positive_words)
        negative_count = sum(1 for word in words if word in negative_words)
        total_sentiment_words = positive_count + negative_count
        if total_sentiment_words == 0:
            return 0.0
        # Net sentiment density over all tokens, clamped to [-1, 1]
        sentiment_score = (positive_count - negative_count) / len(words)
        return max(-1.0, min(1.0, sentiment_score))

    # Wrap the scorer as a typed UDF so the score column is numeric rather than a string
    sentiment_udf = udf(calculate_sentiment_score, FloatType())
    sentiment_df = filtered_df.withColumn("sentiment_score", sentiment_udf(col("content")))
    sentiment_df = sentiment_df.withColumn("sentiment_label", when(col("sentiment_score") > 0.1, "positive").when(col("sentiment_score") < -0.1, "negative").otherwise("neutral"))
    # Aggregate per day: average score plus counts of each sentiment label
    daily_sentiment = sentiment_df.groupBy("publish_date").agg(
        avg("sentiment_score").alias("avg_sentiment"),
        count("*").alias("total_posts"),
        spark_sum(when(col("sentiment_label") == "positive", 1).otherwise(0)).alias("positive_count"),
        spark_sum(when(col("sentiment_label") == "negative", 1).otherwise(0)).alias("negative_count"),
        spark_sum(when(col("sentiment_label") == "neutral", 1).otherwise(0)).alias("neutral_count")
    ).orderBy("publish_date")
    trend_data = daily_sentiment.collect()
    result = []
    for row in trend_data:
        date_str = row["publish_date"]
        sentiment_distribution = {"positive": row["positive_count"], "negative": row["negative_count"], "neutral": row["neutral_count"]}
        trend_point = {"date": date_str, "avg_sentiment": round(row["avg_sentiment"], 3), "total_posts": row["total_posts"], "sentiment_distribution": sentiment_distribution}
        result.append(trend_point)
    return {"trend_data": result, "analysis_period": f"{start_date} to {end_date}"}
def content_profile_analysis(data_path, keyword_filter=None):
    # Load posts, optionally narrowed to those containing a keyword
    df = spark.read.parquet(data_path)
    if keyword_filter:
        df = df.filter(col("content").contains(keyword_filter))
    # Tokenize the content and drop stop words
    tokenizer = Tokenizer(inputCol="content", outputCol="words")
    tokenized_df = tokenizer.transform(df)
    stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
    filtered_df = stopwords_remover.transform(tokenized_df)
    # High-frequency keywords (appearing more than 10 times)
    word_counts = filtered_df.select(explode(col("filtered_words")).alias("word")).groupBy("word").count().filter(col("count") > 10).orderBy(desc("count"))
    top_keywords = word_counts.limit(100).collect()
    keyword_list = [{"word": row["word"], "frequency": row["count"]} for row in top_keywords]
    # Average post length in characters
    content_length_stats = df.select(length(col("content")).alias("content_length")).agg(avg("content_length").alias("avg_length"), count("*").alias("total_content")).collect()[0]
    # Most common #hashtag# topics
    hashtag_pattern = r'#([^#\s]+)#'
    hashtag_df = df.select(regexp_extract(col("content"), hashtag_pattern, 1).alias("hashtag")).filter(col("hashtag") != "").groupBy("hashtag").count().orderBy(desc("count"))
    top_hashtags = hashtag_df.limit(20).collect()
    hashtag_list = [{"hashtag": row["hashtag"], "count": row["count"]} for row in top_hashtags]
    # Most frequently @-mentioned users
    mention_pattern = r'@([a-zA-Z0-9_\u4e00-\u9fa5]+)'
    mention_df = df.select(regexp_extract(col("content"), mention_pattern, 1).alias("mentioned_user")).filter(col("mentioned_user") != "").groupBy("mentioned_user").count().orderBy(desc("count"))
    top_mentions = mention_df.limit(20).collect()
    mention_list = [{"user": row["mentioned_user"], "mention_count": row["count"]} for row in top_mentions]
    # Post counts per platform and per content type
    platform_distribution = df.groupBy("platform").count().orderBy(desc("count")).collect()
    platform_stats = [{"platform": row["platform"], "post_count": row["count"]} for row in platform_distribution]
    content_type_distribution = df.groupBy("content_type").count().collect()
    content_type_stats = [{"type": row["content_type"], "count": row["count"]} for row in content_type_distribution]
    return {"keywords": keyword_list, "content_stats": {"avg_length": round(content_length_stats["avg_length"], 2), "total_posts": content_length_stats["total_content"]}, "hashtags": hashtag_list, "mentions": mention_list, "platform_distribution": platform_stats, "content_type_distribution": content_type_stats}
def user_interaction_analysis(data_path, time_range_days=30):
    # Keep only posts published within the last N days
    df = spark.read.parquet(data_path)
    recent_df = df.filter(col("publish_time") >= date_sub(current_date(), time_range_days))
    # Per-user activity and engagement statistics
    user_activity = recent_df.groupBy("user_id", "username").agg(
        count("*").alias("post_count"),
        spark_sum("like_count").alias("total_likes"),
        spark_sum("comment_count").alias("total_comments"),
        spark_sum("share_count").alias("total_shares"),
        avg("like_count").alias("avg_likes_per_post"),
        avg("comment_count").alias("avg_comments_per_post")
    ).orderBy(desc("post_count"))
    top_active_users = user_activity.limit(50).collect()
    active_user_list = []
    for row in top_active_users:
        # Engagement rate = (likes + comments + shares) per post
        user_engagement = (row["total_likes"] + row["total_comments"] + row["total_shares"]) / max(row["post_count"], 1)
        user_info = {"user_id": row["user_id"], "username": row["username"], "post_count": row["post_count"], "total_engagement": row["total_likes"] + row["total_comments"] + row["total_shares"], "engagement_rate": round(user_engagement, 2), "avg_likes": round(row["avg_likes_per_post"], 2), "avg_comments": round(row["avg_comments_per_post"], 2)}
        active_user_list.append(user_info)
    # Daily interaction totals for the trend chart
    interaction_trends = recent_df.groupBy("publish_date").agg(
        spark_sum("like_count").alias("daily_likes"),
        spark_sum("comment_count").alias("daily_comments"),
        spark_sum("share_count").alias("daily_shares"),
        count("*").alias("daily_posts")
    ).orderBy("publish_date")
    trend_data = interaction_trends.collect()
    daily_trends = []
    for row in trend_data:
        daily_engagement = row["daily_likes"] + row["daily_comments"] + row["daily_shares"]
        trend_point = {"date": row["publish_date"], "likes": row["daily_likes"], "comments": row["daily_comments"], "shares": row["daily_shares"], "posts": row["daily_posts"], "total_engagement": daily_engagement, "engagement_per_post": round(daily_engagement / max(row["daily_posts"], 1), 2)}
        daily_trends.append(trend_point)
    # Weighted influence score: shares weigh most, then comments, then likes
    user_influence_score = recent_df.groupBy("user_id").agg(
        (spark_sum("like_count") * 1 + spark_sum("comment_count") * 2 + spark_sum("share_count") * 3).alias("influence_score"),
        count("*").alias("post_frequency")
    ).orderBy(desc("influence_score"))
    top_influencers = user_influence_score.limit(20).collect()
    influencer_list = [{"user_id": row["user_id"], "influence_score": row["influence_score"], "post_frequency": row["post_frequency"]} for row in top_influencers]
    # Post counts grouped by user type
    user_type_distribution = recent_df.groupBy("user_type").count().collect()
    user_type_stats = [{"user_type": row["user_type"], "count": row["count"]} for row in user_type_distribution]
    return {"active_users": active_user_list, "daily_trends": daily_trends, "top_influencers": influencer_list, "user_type_distribution": user_type_stats, "analysis_timeframe": f"Last {time_range_days} days"}
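All three functions return plain Python dictionaries, so the backend only needs to serialize them to JSON for the Vue + Echarts charts. Below is a minimal driver sketch under that assumption; the HDFS path and the date strings are illustrative placeholders rather than values from the project.

# Minimal driver sketch (the path and dates below are hypothetical placeholders)
import json

if __name__ == "__main__":
    data_path = "hdfs:///warehouse/social_media_posts"  # placeholder dataset location
    trend = sentiment_trend_analysis(data_path, "2024-01-01", "2024-03-31")
    profile = content_profile_analysis(data_path)
    interaction = user_interaction_analysis(data_path, time_range_days=30)
    # Serialize to JSON so a SpringBoot/Django endpoint can pass it to the Echarts frontend
    print(json.dumps({"trend": trend, "profile": profile, "interaction": interaction}, ensure_ascii=False, indent=2))
    spark.stop()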
6. Selected Documentation
7. END
💕💕To get the source code, contact 计算机编程果茶熊