前言
💖💖作者:计算机程序员小杨 💙💙个人简介:我是一名计算机相关专业的从业者,擅长Java、微信小程序、Python、Golang、安卓Android等多个IT方向。会做一些项目定制化开发、代码讲解、答辩教学、文档编写、也懂一些降重方面的技巧。热爱技术,喜欢钻研新工具和框架,也乐于通过代码解决实际问题,大家有技术代码这一块的问题可以问我! 💛💛想说的话:感谢大家的关注与支持! 💕💕文末获取源码联系 计算机程序员小杨 💜💜 网站实战项目 安卓/小程序实战项目 大数据实战项目 深度学习实战项目 计算机毕业设计选题 💜💜
一.开发工具简介
大数据框架:Hadoop+Spark(本次没用Hive,支持定制) 开发语言:Python+Java(两个版本都支持) 后端框架:Django+Spring Boot(Spring+SpringMVC+Mybatis)(两个版本都支持) 前端:Vue+ElementUI+Echarts+HTML+CSS+JavaScript+jQuery 详细技术点:Hadoop、HDFS、Spark、Spark SQL、Pandas、NumPy 数据库:MySQL
二.系统内容简介
本旅游上榜景点及评论数据可视化分析系统基于Hadoop+Spark大数据框架构建,采用Python作为核心开发语言,后端使用Django框架提供RESTful API接口,前端通过Vue+ElementUI+Echarts实现交互式数据可视化界面。系统利用HDFS存储海量旅游景点及用户评论数据,借助Spark SQL进行分布式数据处理与查询,结合Pandas和NumPy进行数据清洗、统计分析与特征提取。系统功能涵盖系统管理、热门评分分析、价格维度分析、游客行为分析、景点特征分析、地域分布分析以及可视化大屏展示七大模块,能够从多维度对旅游景点数据进行深度挖掘,通过评分趋势、价格区间分布、游客访问时段、景点类型偏好、地理位置热力图等可视化图表,为旅游从业者提供数据决策支持,帮助游客快速筛选适合的旅游目的地,实现旅游数据的智能化分析与精准化推荐。
三.系统功能演示
四.系统界面展示
五.系统源码展示
import json

import numpy as np
import pandas as pd
from django.http import JsonResponse
from django.views.decorators.http import require_http_methods
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    avg,
    col,
    count,
    dayofweek,
    desc,
    hour,
    length,
    lower,
    month,
    regexp_replace,
    sum,
    trim,
    when,
)
from pyspark.sql.types import FloatType, IntegerType
# Module-level SparkSession shared by every analysis view; getOrCreate()
# reuses an existing session on repeated imports instead of spawning a new one.
spark = (
    SparkSession.builder
    .appName("TourismAnalysis")
    .config("spark.driver.memory", "2g")
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)
@require_http_methods(["GET"])
def hot_rating_analysis(request):
    """Return hot-rating statistics for attractions as JSON.

    Reads the attraction CSV from HDFS, cleans it, and responds with:
      - top_attractions: top 20 attractions by weighted popularity score
      - rating_distribution: row counts per rating-level bucket
      - monthly_trend: average rating and review count per month
      - total_attractions: number of distinct attraction names
    """
    hdfs_path = "hdfs://localhost:9000/tourism/attractions.csv"
    df = spark.read.csv(hdfs_path, header=True, inferSchema=True)
    # Drop rows missing the fields every downstream aggregation relies on.
    df_cleaned = df.filter(col("rating").isNotNull() & col("attraction_name").isNotNull())
    df_cleaned = df_cleaned.withColumn("rating", col("rating").cast(FloatType()))
    df_cleaned = df_cleaned.withColumn("review_count", col("review_count").cast(IntegerType()))
    # BUG FIX: the original used count("review_count"), which counts rows with a
    # non-null review_count rather than totalling the reviews. The alias
    # "total_reviews" and the >= 50 threshold both intend the sum.
    rating_stats = (
        df_cleaned.groupBy("attraction_name")
        .agg(
            avg("rating").alias("avg_rating"),
            sum("review_count").alias("total_reviews"),
            sum("view_count").alias("total_views"),
        )
        .filter(col("total_reviews") >= 50)
    )
    # Weighted popularity: rating dominates; review/view volume act as tie-breakers.
    rating_stats = rating_stats.withColumn(
        "popularity_score",
        col("avg_rating") * 0.4
        + col("total_reviews") * 0.00001
        + col("total_views") * 0.000005,
    )
    top_attractions = rating_stats.orderBy(desc("popularity_score")).limit(20)
    result_pd = top_attractions.toPandas()
    # Bucket ratings into four human-readable levels (labels are user-facing).
    rating_distribution = (
        df_cleaned.withColumn(
            "rating_level",
            when(col("rating") >= 4.5, "优秀")
            .when(col("rating") >= 4.0, "良好")
            .when(col("rating") >= 3.5, "一般")
            .otherwise("较差"),
        )
        .groupBy("rating_level")
        .agg(count("*").alias("count"))
    )
    rating_dist_pd = rating_distribution.toPandas()
    monthly_trend = (
        df_cleaned.withColumn("month", month(col("review_date")))
        .groupBy("month")
        .agg(avg("rating").alias("avg_rating"), count("*").alias("review_count"))
        .orderBy("month")
    )
    monthly_pd = monthly_trend.toPandas()
    response_data = {
        "top_attractions": result_pd.to_dict(orient='records'),
        "rating_distribution": rating_dist_pd.to_dict(orient='records'),
        "monthly_trend": monthly_pd.to_dict(orient='records'),
        "total_attractions": df_cleaned.select("attraction_name").distinct().count(),
    }
    return JsonResponse(response_data, safe=False)
@require_http_methods(["GET"])
def price_dimension_analysis(request):
    """Return ticket-price analytics for attractions as JSON.

    Query params:
      price_min, price_max: optional numeric bounds on ticket price
        (defaults 0 and 1000; malformed values fall back to the defaults).

    Responds with price-range distribution, price/rating correlation,
    per-scenic-type price stats, price percentiles, and the top 15
    attractions by cost-performance ratio.
    """
    # BUG FIX: the original called int() on raw query strings, so any
    # non-integer value (e.g. "49.5" or "abc") raised ValueError and
    # produced a 500. Parse as float and fall back to the defaults.
    try:
        price_min = float(request.GET.get('price_min', 0))
    except (TypeError, ValueError):
        price_min = 0.0
    try:
        price_max = float(request.GET.get('price_max', 1000))
    except (TypeError, ValueError):
        price_max = 1000.0
    hdfs_path = "hdfs://localhost:9000/tourism/attractions.csv"
    df = spark.read.csv(hdfs_path, header=True, inferSchema=True)
    df_price = df.filter(
        (col("ticket_price").isNotNull())
        & (col("ticket_price") >= price_min)
        & (col("ticket_price") <= price_max)
    )
    df_price = df_price.withColumn("ticket_price", col("ticket_price").cast(FloatType()))
    # Bucket prices into display ranges (labels are user-facing).
    price_ranges = (
        df_price.withColumn(
            "price_range",
            when(col("ticket_price") == 0, "免费")
            .when(col("ticket_price") < 50, "50元以下")
            .when(col("ticket_price") < 100, "50-100元")
            .when(col("ticket_price") < 200, "100-200元")
            .otherwise("200元以上"),
        )
        .groupBy("price_range")
        .agg(count("*").alias("count"), avg("rating").alias("avg_rating"))
    )
    price_range_pd = price_ranges.toPandas()
    # Pearson correlation between price and rating, computed client-side in pandas.
    price_rating_corr = df_price.select("ticket_price", "rating").toPandas()
    correlation = price_rating_corr['ticket_price'].corr(price_rating_corr['rating'])
    scenic_type_price = (
        df_price.groupBy("scenic_type")
        .agg(
            avg("ticket_price").alias("avg_price"),
            count("*").alias("count"),
            avg("rating").alias("avg_rating"),
        )
        .filter(col("count") >= 5)  # skip types with too few samples to be meaningful
        .orderBy(desc("avg_price"))
    )
    type_price_pd = scenic_type_price.toPandas()
    price_percentiles = df_price.select("ticket_price").summary("25%", "50%", "75%").toPandas()
    # Cost-performance = rating per yuan (free attractions use divisor 1 to
    # avoid division by zero), scaled by 100 for readability.
    cost_performance = (
        df_price.withColumn(
            "cp_ratio",
            col("rating")
            / when(col("ticket_price") == 0, 1).otherwise(col("ticket_price"))
            * 100,
        )
        .orderBy(desc("cp_ratio"))
        .limit(15)
    )
    cp_pd = cost_performance.toPandas()
    response_data = {
        "price_distribution": price_range_pd.to_dict(orient='records'),
        "price_rating_correlation": float(correlation),
        "type_price_analysis": type_price_pd.to_dict(orient='records'),
        "price_percentiles": price_percentiles.to_dict(orient='records'),
        "cost_performance_top": cp_pd.to_dict(orient='records'),
    }
    return JsonResponse(response_data, safe=False)
@require_http_methods(["GET"])
def tourist_behavior_analysis(request):
    """Return tourist-behavior analytics derived from the review CSV as JSON.

    Query params:
      start_date, end_date: ISO date strings bounding the review window
        (defaults '2023-01-01' .. '2024-12-31').

    Responds with hourly/weekly review patterns, user activity tiers,
    keyword-based sentiment counts, and the six busiest months.
    """
    start_date = request.GET.get('start_date', '2023-01-01')
    end_date = request.GET.get('end_date', '2024-12-31')
    hdfs_path = "hdfs://localhost:9000/tourism/reviews.csv"
    df = spark.read.csv(hdfs_path, header=True, inferSchema=True)
    # String comparison on review_date works here because the dates are
    # ISO-formatted (lexicographic order == chronological order).
    df_behavior = df.filter(
        (col("review_date") >= start_date)
        & (col("review_date") <= end_date)
        & col("user_id").isNotNull()
    )
    df_behavior = df_behavior.withColumn("review_date", col("review_date").cast("timestamp"))
    hourly_pattern = (
        df_behavior.withColumn("hour", hour("review_date"))
        .groupBy("hour")
        .agg(count("*").alias("review_count"))
        .orderBy("hour")
    )
    hourly_pd = hourly_pattern.toPandas()
    weekly_pattern = (
        df_behavior.withColumn("weekday", dayofweek("review_date"))
        .groupBy("weekday")
        .agg(count("*").alias("review_count"), avg("rating").alias("avg_rating"))
        .orderBy("weekday")
    )
    weekly_pd = weekly_pattern.toPandas()
    # Tier users by how many reviews they wrote (labels are user-facing).
    user_activity = (
        df_behavior.groupBy("user_id")
        .agg(count("*").alias("review_count"), avg("rating").alias("avg_rating"))
        .withColumn(
            "user_type",
            when(col("review_count") >= 10, "活跃用户")
            .when(col("review_count") >= 5, "普通用户")
            .otherwise("新用户"),
        )
    )
    user_type_dist = user_activity.groupBy("user_type").agg(count("*").alias("user_count"))
    user_type_pd = user_type_dist.toPandas()
    # BUG FIX: the original stored the raw review_content string under the
    # alias "review_length"; the alias (and the empty-string placeholder)
    # show the intent was the character length of the review.
    df_behavior = df_behavior.withColumn(
        "review_length",
        when(col("review_content").isNotNull(), length(col("review_content"))).otherwise(0),
    )
    # Crude keyword sentiment: flag reviews containing positive/negative terms.
    sentiment_keywords = (
        df_behavior.withColumn(
            "has_positive",
            when(col("review_content").rlike("好|棒|推荐|值得|美|漂亮"), 1).otherwise(0),
        )
        .withColumn(
            "has_negative",
            when(col("review_content").rlike("差|不好|坑|贵|失望|糟糕"), 1).otherwise(0),
        )
    )
    sentiment_stats = sentiment_keywords.agg(
        sum("has_positive").alias("positive_count"),
        sum("has_negative").alias("negative_count"),
    )
    sentiment_pd = sentiment_stats.toPandas()
    peak_months = (
        df_behavior.withColumn("month", month("review_date"))
        .groupBy("month")
        .agg(count("*").alias("visit_count"))
        .orderBy(desc("visit_count"))
        .limit(6)
    )
    peak_pd = peak_months.toPandas()
    response_data = {
        "hourly_pattern": hourly_pd.to_dict(orient='records'),
        "weekly_pattern": weekly_pd.to_dict(orient='records'),
        "user_type_distribution": user_type_pd.to_dict(orient='records'),
        "sentiment_analysis": sentiment_pd.to_dict(orient='records'),
        "peak_months": peak_pd.to_dict(orient='records'),
    }
    return JsonResponse(response_data, safe=False)
六.系统文档展示
结束
💕💕文末获取源码联系 计算机程序员小杨