1. About the Author
💖💖Author: 计算机编程果茶熊 💙💙About me: I spent many years in computer-science training and worked as a programming instructor; I genuinely enjoy teaching and am proficient in Java, WeChat Mini Programs, Python, Golang, Android, and several other IT areas. I take on customized project development, code walkthroughs, thesis-defense coaching, and documentation writing, and I also know a few techniques for lowering plagiarism-check similarity scores. I like sharing solutions to problems I run into in my own development work and talking shop about technology, so feel free to ask me anything code-related! 💛💛A few words: thank you all for your attention and support! 💜💜 Web application projects | Android / Mini Program projects | Big data projects | Computer science graduation project topics 💕💕See the end of the article for how to get the source code; contact 计算机编程果茶熊
2. System Introduction
Big data framework: Hadoop + Spark (Hive can be added through custom modification). Development languages: Java + Python (both versions are supported). Database: MySQL. Backend frameworks: SpringBoot (Spring + SpringMVC + MyBatis) and Django (both versions are supported). Frontend: Vue + ECharts + HTML + CSS + JavaScript + jQuery.
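For the Django version of the backend, the MySQL connection is configured in settings.py. Below is a minimal sketch, assuming the same database name and credentials that appear in the sample code later in this post; these values are illustrative and should be changed for any real deployment.

# settings.py (excerpt) -- illustrative only; the database name and credentials
# mirror the JDBC settings used in the analysis code below.
DATABASES = {
    "default": {
        "ENGINE": "django.db.backends.mysql",
        "NAME": "tourism",
        "USER": "root",
        "PASSWORD": "password",
        "HOST": "localhost",
        "PORT": "3306",
    }
}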
3. Video Walkthrough
4. Feature Showcase
5. Selected Code
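The three Django views below all read a single MySQL table, reviews, through Spark's JDBC reader. As a rough sketch of the schema they assume (the field names are taken from the analysis code; the types, lengths, and the Django model itself are illustrative guesses rather than the project's actual definition), the table might be declared like this:

# models.py (illustrative sketch) -- field names mirror the columns referenced
# in the Spark code below; types and max_length values are assumptions.
from django.db import models

class Review(models.Model):
    user_id = models.CharField(max_length=64)                     # reviewer identifier
    user_location = models.CharField(max_length=128, null=True)   # reviewer origin, may be empty
    scenic_spot = models.CharField(max_length=128)                 # scenic spot being reviewed
    rating = models.FloatField()                                    # star rating, roughly 1-5
    comment_text = models.TextField()                               # review body
    comment_length = models.IntegerField()                          # pre-computed length of comment_text
    review_date = models.DateTimeField()                            # when the review was posted

    class Meta:
        db_table = "reviews"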
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml.feature import Tokenizer, StopWordsRemover, VectorAssembler
from pyspark.ml.clustering import KMeans
from functools import reduce
import pandas as pd
import numpy as np
from django.http import JsonResponse
from django.views.decorators.csrf import csrf_exempt
import json
from collections import Counter
import re

# Shared SparkSession with adaptive query execution enabled.
# The MySQL JDBC driver must be on the Spark classpath (e.g. via spark.jars.packages)
# for the spark.read.format("jdbc") calls below to work.
spark = SparkSession.builder.appName("TourismAnalysis").config("spark.sql.adaptive.enabled", "true").config("spark.sql.adaptive.coalescePartitions.enabled", "true").getOrCreate()
@csrf_exempt
def overall_score_analysis(request):
    # Load the review table from MySQL via JDBC
    df = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/tourism").option("dbtable", "reviews").option("user", "root").option("password", "password").load()
    # Per-spot rating statistics: mean, review count, standard deviation
    score_stats = df.groupBy("scenic_spot").agg(avg("rating").alias("avg_rating"), count("rating").alias("review_count"), stddev("rating").alias("rating_std")).collect()
    # Rating distribution per spot
    score_distribution = df.groupBy("scenic_spot", "rating").count().orderBy("scenic_spot", "rating").collect()
    # Monthly average rating trend per spot
    monthly_trends = df.withColumn("month", date_format("review_date", "yyyy-MM")).groupBy("scenic_spot", "month").agg(avg("rating").alias("monthly_avg")).orderBy("scenic_spot", "month").collect()
    # Map ratings to coarse sentiment labels (>= 4 positive, <= 2 negative)
    sentiment_mapping = df.withColumn("sentiment", when(col("rating") >= 4, "positive").when(col("rating") <= 2, "negative").otherwise("neutral")).groupBy("scenic_spot", "sentiment").count().collect()
    # Bucket ratings into quality ranges
    score_ranges = df.withColumn("score_range", when(col("rating") >= 4.5, "excellent").when(col("rating") >= 3.5, "good").when(col("rating") >= 2.5, "fair").otherwise("poor")).groupBy("scenic_spot", "score_range").count().collect()
    # Cross-spot comparison: min/max rating and distinct reviewers
    comparative_analysis = df.groupBy("scenic_spot").agg(avg("rating").alias("avg_rating"), min("rating").alias("min_rating"), max("rating").alias("max_rating"), approx_count_distinct("user_id").alias("unique_users")).collect()
    # Average rating of detailed reviews (comments longer than 50 characters)
    quality_indicators = df.filter(col("comment_length") > 50).groupBy("scenic_spot").agg(avg("rating").alias("detailed_avg"), count("*").alias("detailed_count")).collect()
    # Seasonal rating averages
    seasonal_patterns = df.withColumn("season", when(month("review_date").isin([12, 1, 2]), "winter").when(month("review_date").isin([3, 4, 5]), "spring").when(month("review_date").isin([6, 7, 8]), "summer").otherwise("autumn")).groupBy("scenic_spot", "season").agg(avg("rating").alias("seasonal_avg")).collect()
    # Rating volatility: average absolute change between consecutive reviews of a spot
    rating_volatility = df.withColumn("date_rank", row_number().over(Window.partitionBy("scenic_spot").orderBy("review_date"))).withColumn("prev_rating", lag("rating").over(Window.partitionBy("scenic_spot").orderBy("review_date"))).withColumn("rating_change", abs(col("rating") - col("prev_rating"))).groupBy("scenic_spot").agg(avg("rating_change").alias("volatility")).collect()
    # Assemble the JSON payload; row["count"] is used because row.count resolves to the tuple method, not the column
    result_data = {"score_stats": [{"spot": row.scenic_spot, "avg": float(row.avg_rating), "count": row.review_count, "std": float(row.rating_std or 0)} for row in score_stats], "distribution": [{"spot": row.scenic_spot, "rating": row.rating, "count": row["count"]} for row in score_distribution], "trends": [{"spot": row.scenic_spot, "month": row.month, "avg": float(row.monthly_avg)} for row in monthly_trends], "sentiment": [{"spot": row.scenic_spot, "type": row.sentiment, "count": row["count"]} for row in sentiment_mapping]}
    return JsonResponse(result_data, safe=False)

@csrf_exempt
def tourist_type_analysis(request):
    # Load the review table from MySQL via JDBC
    df = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/tourism").option("dbtable", "reviews").option("user", "root").option("password", "password").load()
    # Build per-user behavioural features for clustering
    user_features = df.groupBy("user_id").agg(avg("rating").alias("avg_rating"), count("*").alias("review_count"), countDistinct("scenic_spot").alias("spots_visited"), avg("comment_length").alias("avg_comment_length"), min("review_date").alias("first_review"), max("review_date").alias("last_review")).withColumn("activity_span", datediff("last_review", "first_review"))
    feature_cols = ["avg_rating", "review_count", "spots_visited", "avg_comment_length", "activity_span"]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    user_vectors = assembler.transform(user_features.na.fill(0))
    # Cluster users into four tourist types with K-means
    kmeans = KMeans(k=4, seed=42, featuresCol="features", predictionCol="cluster")
    model = kmeans.fit(user_vectors)
    clustered_users = model.transform(user_vectors)
    # Profile each cluster by its average behaviour
    cluster_profiles = clustered_users.groupBy("cluster").agg(avg("avg_rating").alias("cluster_avg_rating"), avg("review_count").alias("cluster_avg_reviews"), avg("spots_visited").alias("cluster_avg_spots"), avg("avg_comment_length").alias("cluster_avg_length"), count("*").alias("cluster_size")).collect()
    # Which spots each cluster visits most often
    user_behavior_patterns = df.join(clustered_users.select("user_id", "cluster"), "user_id").groupBy("cluster", "scenic_spot").count().orderBy("cluster", desc("count")).collect()
    # Posting time (hour of day) per cluster
    time_patterns = df.join(clustered_users.select("user_id", "cluster"), "user_id").withColumn("hour", hour("review_date")).groupBy("cluster", "hour").count().collect()
    # Rating distribution per cluster
    rating_patterns = df.join(clustered_users.select("user_id", "cluster"), "user_id").groupBy("cluster", "rating").count().collect()
    # Reviewer origin per cluster (only where the location field is present)
    geographic_patterns = df.join(clustered_users.select("user_id", "cluster"), "user_id").filter(col("user_location").isNotNull()).groupBy("cluster", "user_location").count().orderBy("cluster", desc("count")).collect()
    # Weighted loyalty score per cluster
    loyalty_analysis = clustered_users.withColumn("loyalty_score", col("review_count") * 0.4 + col("spots_visited") * 0.3 + col("activity_span") * 0.3).groupBy("cluster").agg(avg("loyalty_score").alias("avg_loyalty")).collect()
    # Engagement level inferred from average comment length
    engagement_levels = clustered_users.withColumn("engagement", when(col("avg_comment_length") > 100, "high").when(col("avg_comment_length") > 50, "medium").otherwise("low")).groupBy("cluster", "engagement").count().collect()
    # Seasonal preference per cluster
    seasonal_preferences = df.join(clustered_users.select("user_id", "cluster"), "user_id").withColumn("season", when(month("review_date").isin([12, 1, 2]), "winter").when(month("review_date").isin([3, 4, 5]), "spring").when(month("review_date").isin([6, 7, 8]), "summer").otherwise("autumn")).groupBy("cluster", "season").count().collect()
    # Assemble the JSON payload; row["count"] avoids the Row.count tuple-method clash
    result_data = {"profiles": [{"cluster": row.cluster, "avg_rating": float(row.cluster_avg_rating), "avg_reviews": float(row.cluster_avg_reviews), "avg_spots": float(row.cluster_avg_spots), "size": row.cluster_size} for row in cluster_profiles], "behaviors": [{"cluster": row.cluster, "spot": row.scenic_spot, "visits": row["count"]} for row in user_behavior_patterns], "patterns": [{"cluster": row.cluster, "hour": row.hour, "activity": row["count"]} for row in time_patterns]}
    return JsonResponse(result_data, safe=False)

@csrf_exempt
def text_content_analysis(request):
    # Load the review table from MySQL via JDBC
    df = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/tourism").option("dbtable", "reviews").option("user", "root").option("password", "password").load()
    # Whitespace tokenisation (a simplification for Chinese text; a segmenter such as jieba would give better tokens)
    tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")
    tokenized_df = tokenizer.transform(df)
    stop_words = ["的", "了", "在", "是", "我", "有", "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好", "自己", "这"]
    remover = StopWordsRemover(inputCol="words", outputCol="filtered_words", stopWords=stop_words)
    filtered_df = remover.transform(tokenized_df)
    # Top-100 most frequent words across all reviews
    word_freq = filtered_df.select(explode("filtered_words").alias("word")).groupBy("word").count().orderBy(desc("count")).limit(100).collect()
    sentiment_keywords = {"positive": ["好", "棒", "美", "漂亮", "推荐", "值得", "喜欢", "满意", "不错", "赞"], "negative": ["差", "烂", "失望", "糟糕", "不好", "坑", "贵", "脏", "乱", "骗"]}
    # Keyword-based sentiment scoring; reduce() sums the per-keyword hit columns
    # (the builtin sum() is shadowed here by pyspark.sql.functions.sum, which does not accept a list)
    sentiment_analysis = df.withColumn("positive_score", reduce(lambda a, b: a + b, [when(col("comment_text").contains(word), 1).otherwise(0) for word in sentiment_keywords["positive"]])).withColumn("negative_score", reduce(lambda a, b: a + b, [when(col("comment_text").contains(word), 1).otherwise(0) for word in sentiment_keywords["negative"]])).withColumn("sentiment_label", when(col("positive_score") > col("negative_score"), "positive").when(col("negative_score") > col("positive_score"), "negative").otherwise("neutral"))
    spot_sentiment = sentiment_analysis.groupBy("scenic_spot", "sentiment_label").count().collect()
    # Most frequent keywords per spot (single characters filtered out)
    keyword_by_spot = filtered_df.select("scenic_spot", explode("filtered_words").alias("word")).filter(length("word") > 1).groupBy("scenic_spot", "word").count().orderBy("scenic_spot", desc("count")).collect()
    # Rating by comment-length category
    comment_length_analysis = df.withColumn("length_category", when(col("comment_length") < 20, "short").when(col("comment_length") < 100, "medium").otherwise("long")).groupBy("scenic_spot", "length_category").agg(count("*").alias("count"), avg("rating").alias("avg_rating")).collect()
    # Top-10 topic words per spot via an RDD word count
    topic_extraction = filtered_df.select("scenic_spot", "filtered_words").rdd.map(lambda row: (row.scenic_spot, row.filtered_words)).groupByKey().map(lambda x: (x[0], [word for words in x[1] for word in words])).map(lambda x: (x[0], Counter(x[1]).most_common(10))).collect()
    # Monthly sentiment trend per spot
    time_sentiment_trend = sentiment_analysis.withColumn("month", date_format("review_date", "yyyy-MM")).groupBy("scenic_spot", "month", "sentiment_label").count().orderBy("scenic_spot", "month").collect()
    # Per-user sentiment consistency (mode() requires Spark 3.4+); computed but not returned in this partial snippet
    user_sentiment_consistency = sentiment_analysis.groupBy("user_id").agg(countDistinct("sentiment_label").alias("sentiment_variety"), mode("sentiment_label").alias("dominant_sentiment")).collect()
    # Correlation between the numeric rating and a simple text sentiment score
    rating_text_correlation = df.withColumn("text_sentiment", when(col("comment_text").rlike("好|棒|美|推荐|值得"), 1).when(col("comment_text").rlike("差|烂|失望|不好"), -1).otherwise(0)).groupBy("scenic_spot").agg(corr("rating", "text_sentiment").alias("correlation")).collect()
    # Assemble the JSON payload; row["count"] avoids the Row.count tuple-method clash
    result_data = {"word_frequency": [{"word": row.word, "count": row["count"]} for row in word_freq], "sentiment_distribution": [{"spot": row.scenic_spot, "sentiment": row.sentiment_label, "count": row["count"]} for row in spot_sentiment], "spot_keywords": [{"spot": row.scenic_spot, "word": row.word, "frequency": row["count"]} for row in keyword_by_spot[:200]], "topics": [{"spot": spot, "keywords": keywords} for spot, keywords in topic_extraction]}
    return JsonResponse(result_data, safe=False)
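To call these views from the Vue/ECharts frontend, they have to be exposed through Django's URL routing. Below is a minimal sketch; the import path analysis.views and the URL paths are placeholders I chose for illustration, not the project's actual configuration.

# urls.py (sketch) -- the module path "analysis.views" and the route paths
# are hypothetical; adapt them to the project's actual app layout.
from django.urls import path
from analysis import views

urlpatterns = [
    path("api/score/overall/", views.overall_score_analysis),
    path("api/tourist/types/", views.tourist_type_analysis),
    path("api/text/content/", views.text_content_analysis),
]

Each route returns the JSON payload assembled by the corresponding view, which the frontend can feed directly into ECharts series data.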
6. Selected Documentation
7. END
💕💕Contact 计算机编程果茶熊 to get the source code