前言
💖💖作者:计算机程序员小杨 💙💙个人简介:我是一名计算机相关专业的从业者,擅长Java、微信小程序、Python、Golang、安卓Android等多个IT方向。会做一些项目定制化开发、代码讲解、答辩教学、文档编写、也懂一些降重方面的技巧。热爱技术,喜欢钻研新工具和框架,也乐于通过代码解决实际问题,大家有技术代码这一块的问题可以问我! 💛💛想说的话:感谢大家的关注与支持! 💕💕文末获取源码联系 计算机程序员小杨 💜💜 网站实战项目 安卓/小程序实战项目 大数据实战项目 深度学习实战项目 计算机毕业设计选题 💜💜
一.开发工具简介
大数据框架:Hadoop+Spark(本次没用Hive,支持定制) 开发语言:Python+Java(两个版本都支持) 后端框架:Django+Spring Boot(Spring+SpringMVC+Mybatis)(两个版本都支持) 前端:Vue+ElementUI+Echarts+HTML+CSS+JavaScript+jQuery 详细技术点:Hadoop、HDFS、Spark、Spark SQL、Pandas、NumPy 数据库:MySQL
二.系统内容简介
本旅游上榜景点及评论数据可视化分析系统基于Hadoop+Spark大数据框架构建,采用Python作为核心开发语言,后端使用Django框架提供RESTful API接口,前端通过Vue+ElementUI+Echarts实现交互式数据可视化界面。系统利用HDFS存储海量旅游景点及用户评论数据,借助Spark SQL进行分布式数据处理与查询,结合Pandas和NumPy进行数据清洗、统计分析与特征提取。系统功能涵盖系统管理、热门评分分析、价格维度分析、游客行为分析、景点特征分析、地域分布分析以及可视化大屏展示七大模块,能够从多维度对旅游景点数据进行深度挖掘,通过评分趋势、价格区间分布、游客访问时段、景点类型偏好、地理位置热力图等可视化图表,为旅游从业者提供数据决策支持,帮助游客快速筛选适合的旅游目的地,实现旅游数据的智能化分析与精准化推荐。
三.系统功能演示
四.系统界面展示
五.系统源码展示
import json

import numpy as np
import pandas as pd
from django.http import JsonResponse
from django.views.decorators.http import require_http_methods
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    avg,
    col,
    count,
    dayofweek,
    desc,
    hour,
    length,
    lower,
    month,
    regexp_replace,
    sum,
    trim,
    when,
)
from pyspark.sql.types import FloatType, IntegerType
# Module-level SparkSession shared by every analysis view; getOrCreate()
# reuses an existing session on repeated imports instead of spawning a new one.
spark = (
    SparkSession.builder
    .appName("TourismAnalysis")
    .config("spark.driver.memory", "2g")
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)
@require_http_methods(["GET"])
def hot_rating_analysis(request):
    """Return hot-rating statistics for attractions as JSON.

    Reads the attraction CSV from HDFS, cleans it, and responds with:
      - top_attractions: top 20 attractions by weighted popularity score
      - rating_distribution: row counts per rating-level bucket
      - monthly_trend: average rating and review count per month
      - total_attractions: number of distinct attraction names
    """
    hdfs_path = "hdfs://localhost:9000/tourism/attractions.csv"
    df = spark.read.csv(hdfs_path, header=True, inferSchema=True)
    # Drop rows missing the fields every downstream aggregation relies on.
    df_cleaned = df.filter(col("rating").isNotNull() & col("attraction_name").isNotNull())
    df_cleaned = df_cleaned.withColumn("rating", col("rating").cast(FloatType()))
    df_cleaned = df_cleaned.withColumn("review_count", col("review_count").cast(IntegerType()))
    # BUG FIX: the original used count("review_count"), which counts rows with a
    # non-null review_count rather than totalling the reviews. The alias
    # "total_reviews" and the >= 50 threshold both intend the sum.
    rating_stats = (
        df_cleaned.groupBy("attraction_name")
        .agg(
            avg("rating").alias("avg_rating"),
            sum("review_count").alias("total_reviews"),
            sum("view_count").alias("total_views"),
        )
        .filter(col("total_reviews") >= 50)
    )
    # Weighted popularity: rating dominates; review/view volume act as tie-breakers.
    rating_stats = rating_stats.withColumn(
        "popularity_score",
        col("avg_rating") * 0.4
        + col("total_reviews") * 0.00001
        + col("total_views") * 0.000005,
    )
    top_attractions = rating_stats.orderBy(desc("popularity_score")).limit(20)
    result_pd = top_attractions.toPandas()
    # Bucket ratings into four human-readable levels (labels are user-facing).
    rating_distribution = (
        df_cleaned.withColumn(
            "rating_level",
            when(col("rating") >= 4.5, "优秀")
            .when(col("rating") >= 4.0, "良好")
            .when(col("rating") >= 3.5, "一般")
            .otherwise("较差"),
        )
        .groupBy("rating_level")
        .agg(count("*").alias("count"))
    )
    rating_dist_pd = rating_distribution.toPandas()
    monthly_trend = (
        df_cleaned.withColumn("month", month(col("review_date")))
        .groupBy("month")
        .agg(avg("rating").alias("avg_rating"), count("*").alias("review_count"))
        .orderBy("month")
    )
    monthly_pd = monthly_trend.toPandas()
    response_data = {
        "top_attractions": result_pd.to_dict(orient='records'),
        "rating_distribution": rating_dist_pd.to_dict(orient='records'),
        "monthly_trend": monthly_pd.to_dict(orient='records'),
        "total_attractions": df_cleaned.select("attraction_name").distinct().count(),
    }
    return JsonResponse(response_data, safe=False)
@require_http_methods(["GET"])
def price_dimension_analysis(request):
    """Return ticket-price analytics for attractions as JSON.

    Query params:
      price_min, price_max: optional numeric bounds on ticket price
        (defaults 0 and 1000; malformed values fall back to the defaults).

    Responds with price-range distribution, price/rating correlation,
    per-scenic-type price stats, price percentiles, and the top 15
    attractions by cost-performance ratio.
    """
    # BUG FIX: the original called int() on raw query strings, so any
    # non-integer value (e.g. "49.5" or "abc") raised ValueError and
    # produced a 500. Parse as float and fall back to the defaults.
    try:
        price_min = float(request.GET.get('price_min', 0))
    except (TypeError, ValueError):
        price_min = 0.0
    try:
        price_max = float(request.GET.get('price_max', 1000))
    except (TypeError, ValueError):
        price_max = 1000.0
    hdfs_path = "hdfs://localhost:9000/tourism/attractions.csv"
    df = spark.read.csv(hdfs_path, header=True, inferSchema=True)
    df_price = df.filter(
        (col("ticket_price").isNotNull())
        & (col("ticket_price") >= price_min)
        & (col("ticket_price") <= price_max)
    )
    df_price = df_price.withColumn("ticket_price", col("ticket_price").cast(FloatType()))
    # Bucket prices into display ranges (labels are user-facing).
    price_ranges = (
        df_price.withColumn(
            "price_range",
            when(col("ticket_price") == 0, "免费")
            .when(col("ticket_price") < 50, "50元以下")
            .when(col("ticket_price") < 100, "50-100元")
            .when(col("ticket_price") < 200, "100-200元")
            .otherwise("200元以上"),
        )
        .groupBy("price_range")
        .agg(count("*").alias("count"), avg("rating").alias("avg_rating"))
    )
    price_range_pd = price_ranges.toPandas()
    # Pearson correlation between price and rating, computed client-side in pandas.
    price_rating_corr = df_price.select("ticket_price", "rating").toPandas()
    correlation = price_rating_corr['ticket_price'].corr(price_rating_corr['rating'])
    scenic_type_price = (
        df_price.groupBy("scenic_type")
        .agg(
            avg("ticket_price").alias("avg_price"),
            count("*").alias("count"),
            avg("rating").alias("avg_rating"),
        )
        .filter(col("count") >= 5)  # skip types with too few samples to be meaningful
        .orderBy(desc("avg_price"))
    )
    type_price_pd = scenic_type_price.toPandas()
    price_percentiles = df_price.select("ticket_price").summary("25%", "50%", "75%").toPandas()
    # Cost-performance = rating per yuan (free attractions use divisor 1 to
    # avoid division by zero), scaled by 100 for readability.
    cost_performance = (
        df_price.withColumn(
            "cp_ratio",
            col("rating")
            / when(col("ticket_price") == 0, 1).otherwise(col("ticket_price"))
            * 100,
        )
        .orderBy(desc("cp_ratio"))
        .limit(15)
    )
    cp_pd = cost_performance.toPandas()
    response_data = {
        "price_distribution": price_range_pd.to_dict(orient='records'),
        "price_rating_correlation": float(correlation),
        "type_price_analysis": type_price_pd.to_dict(orient='records'),
        "price_percentiles": price_percentiles.to_dict(orient='records'),
        "cost_performance_top": cp_pd.to_dict(orient='records'),
    }
    return JsonResponse(response_data, safe=False)
@require_http_methods(["GET"])
def tourist_behavior_analysis(request):
    """Return tourist-behavior analytics derived from the review CSV as JSON.

    Query params:
      start_date, end_date: ISO date strings bounding the review window
        (defaults '2023-01-01' .. '2024-12-31').

    Responds with hourly/weekly review patterns, user activity tiers,
    keyword-based sentiment counts, and the six busiest months.
    """
    start_date = request.GET.get('start_date', '2023-01-01')
    end_date = request.GET.get('end_date', '2024-12-31')
    hdfs_path = "hdfs://localhost:9000/tourism/reviews.csv"
    df = spark.read.csv(hdfs_path, header=True, inferSchema=True)
    # String comparison on review_date works here because the dates are
    # ISO-formatted (lexicographic order == chronological order).
    df_behavior = df.filter(
        (col("review_date") >= start_date)
        & (col("review_date") <= end_date)
        & col("user_id").isNotNull()
    )
    df_behavior = df_behavior.withColumn("review_date", col("review_date").cast("timestamp"))
    hourly_pattern = (
        df_behavior.withColumn("hour", hour("review_date"))
        .groupBy("hour")
        .agg(count("*").alias("review_count"))
        .orderBy("hour")
    )
    hourly_pd = hourly_pattern.toPandas()
    weekly_pattern = (
        df_behavior.withColumn("weekday", dayofweek("review_date"))
        .groupBy("weekday")
        .agg(count("*").alias("review_count"), avg("rating").alias("avg_rating"))
        .orderBy("weekday")
    )
    weekly_pd = weekly_pattern.toPandas()
    # Tier users by how many reviews they wrote (labels are user-facing).
    user_activity = (
        df_behavior.groupBy("user_id")
        .agg(count("*").alias("review_count"), avg("rating").alias("avg_rating"))
        .withColumn(
            "user_type",
            when(col("review_count") >= 10, "活跃用户")
            .when(col("review_count") >= 5, "普通用户")
            .otherwise("新用户"),
        )
    )
    user_type_dist = user_activity.groupBy("user_type").agg(count("*").alias("user_count"))
    user_type_pd = user_type_dist.toPandas()
    # BUG FIX: the original stored the raw review_content string under the
    # alias "review_length"; the alias (and the empty-string placeholder)
    # show the intent was the character length of the review.
    df_behavior = df_behavior.withColumn(
        "review_length",
        when(col("review_content").isNotNull(), length(col("review_content"))).otherwise(0),
    )
    # Crude keyword sentiment: flag reviews containing positive/negative terms.
    sentiment_keywords = (
        df_behavior.withColumn(
            "has_positive",
            when(col("review_content").rlike("好|棒|推荐|值得|美|漂亮"), 1).otherwise(0),
        )
        .withColumn(
            "has_negative",
            when(col("review_content").rlike("差|不好|坑|贵|失望|糟糕"), 1).otherwise(0),
        )
    )
    sentiment_stats = sentiment_keywords.agg(
        sum("has_positive").alias("positive_count"),
        sum("has_negative").alias("negative_count"),
    )
    sentiment_pd = sentiment_stats.toPandas()
    peak_months = (
        df_behavior.withColumn("month", month("review_date"))
        .groupBy("month")
        .agg(count("*").alias("visit_count"))
        .orderBy(desc("visit_count"))
        .limit(6)
    )
    peak_pd = peak_months.toPandas()
    response_data = {
        "hourly_pattern": hourly_pd.to_dict(orient='records'),
        "weekly_pattern": weekly_pd.to_dict(orient='records'),
        "user_type_distribution": user_type_pd.to_dict(orient='records'),
        "sentiment_analysis": sentiment_pd.to_dict(orient='records'),
        "peak_months": peak_pd.to_dict(orient='records'),
    }
    return JsonResponse(response_data, safe=False)
六.系统文档展示
结束
💕💕文末获取源码联系 计算机程序员小杨