Hotel Review Text Sentiment Analysis | [Big Data Graduation Project] Topic Recommendations: Topic Selection + Customization + Deployment + Tutorial + PPT (Hadoop, Spark, Django, Java)


💖💖 Author: 计算机毕业设计杰瑞 💙💙 About me: I have long taught computer science training courses and genuinely enjoy teaching. My strongest languages are Java, WeChat Mini Program development, Python, Golang, and Android, and my project work covers big data, deep learning, websites, mini programs, Android apps, and algorithms. I also take on custom project development, code walkthroughs, thesis-defense coaching, and documentation writing, and I know a few techniques for lowering plagiarism-check similarity. I enjoy sharing solutions to problems I run into during development and talking about technology, so feel free to ask me about any code or technical issues! 💛💛 A few words: thank you all for your attention and support! 💜💜 Website projects | Android/mini-program projects | Big data projects | Deep learning projects | Graduation project topic recommendations

Hotel Review Text Sentiment Analysis Study: Introduction

This system is a hotel review sentiment analysis platform built on Hadoop distributed storage and the Spark big data processing framework, designed for automated sentiment recognition and analysis of user reviews in the hotel industry. It adopts a hybrid architecture that combines traditional machine learning algorithms with modern deep learning techniques, allowing it to process large volumes of hotel review data efficiently, automatically identify the sentiment polarity of each review, and generate detailed analysis reports. The platform integrates a complete processing pipeline covering data collection, preprocessing, feature extraction, sentiment classification, and result visualization, and supports both real-time and batch processing modes. By comparing the results of conventional statistical analysis against the intelligent algorithms, it gives hotel managers an objective basis for evaluating service quality. The system is also designed for scalability and fault tolerance, so it can adapt to workloads of different sizes, providing technical support for the hotel industry's digital transformation and helping improve customer satisfaction and service quality.
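
As a rough illustration of the ingestion side described above, the sketch below shows one way review data stored on HDFS could be loaded into Spark before the analysis code later in this post runs. The HDFS path, file format, and schema (review_id, rating, review_text) are assumptions made for this example, not details taken from the actual project.

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession.builder.appName("HotelReviewIngestion").getOrCreate()

# Hypothetical schema matching the columns used by preprocessTextData below
review_schema = StructType([
    StructField("review_id", StringType(), False),
    StructField("rating", IntegerType(), True),
    StructField("review_text", StringType(), True),
])

# Assumed HDFS location of the raw review export (CSV with a header row)
raw_reviews = (spark.read
               .option("header", "true")
               .schema(review_schema)
               .csv("hdfs:///data/hotel_reviews/reviews.csv"))

print(raw_reviews.count(), "reviews loaded")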

Hotel Review Text Sentiment Analysis Study: Demo Video

(Demo video omitted.)

Hotel Review Text Sentiment Analysis Study: Demo Screenshots

(Demo screenshots omitted.)

Hotel Review Text Sentiment Analysis Study: Code Walkthrough

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, regexp_replace, lower
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.classification import LogisticRegression, NaiveBayes
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.functions import vector_to_array  # needed to read class probabilities (Spark 3.0+)
import jieba  # Chinese word segmentation; must also be available on the Spark executors

# Spark session with adaptive query execution and partition coalescing enabled
spark = (SparkSession.builder
         .appName("HotelSentimentAnalysis")
         .config("spark.sql.adaptive.enabled", "true")
         .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
         .getOrCreate())

def preprocessTextData(df):
    # Keep only Chinese characters, letters, and digits, then lowercase the text
    cleaned_df = df.withColumn("cleaned_text", regexp_replace(col("review_text"), "[^\\u4e00-\\u9fa5a-zA-Z0-9]", " "))
    cleaned_df = cleaned_df.withColumn("cleaned_text", lower(col("cleaned_text")))
    cleaned_df = cleaned_df.filter(col("cleaned_text").isNotNull() & (col("cleaned_text") != ""))
    # Segment the Chinese text with jieba on the executors and rebuild a DataFrame
    segmented_rdd = cleaned_df.rdd.map(lambda row: (row.review_id, row.rating, " ".join(jieba.cut(row.cleaned_text))))
    segmented_df = spark.createDataFrame(segmented_rdd, ["review_id", "rating", "segmented_text"])
    # Derive a binary label from the star rating: >= 4 stars is positive (1), otherwise negative (0)
    labeled_df = segmented_df.withColumn("sentiment_label", when(col("rating") >= 4, 1).otherwise(0))
    # Split the space-joined segments back into tokens and drop common Chinese stop words
    tokenizer = Tokenizer(inputCol="segmented_text", outputCol="words")
    tokenized_df = tokenizer.transform(labeled_df)
    chinese_stopwords = ["的", "了", "在", "是", "我", "有", "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好", "自己", "这"]
    remover = StopWordsRemover(inputCol="words", outputCol="filtered_words", stopWords=chinese_stopwords)
    filtered_df = remover.transform(tokenized_df)
    return filtered_df.select("review_id", "sentiment_label", "filtered_words")
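
# Note: the segmentation step above runs jieba inside an RDD map, so the jieba
# package must be importable on every Spark executor (for example, shipped via
# --py-files or preinstalled in the cluster's Python environment).
# Illustrative trace (assumed example row, not project data): ("r001", 5, "房间很干净,服务态度也很好")
# would be cleaned, segmented by jieba into roughly "房间 很 干净 服务 态度 也 很 好",
# and assigned sentiment_label = 1 because rating >= 4.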

def buildTraditionalModel(train_df, test_df):
    # TF-IDF features: bag-of-words counts (top 10,000 terms appearing in at least 2 documents) re-weighted by IDF
    cv = CountVectorizer(inputCol="filtered_words", outputCol="raw_features", vocabSize=10000, minDF=2.0)
    cv_model = cv.fit(train_df)
    train_cv = cv_model.transform(train_df)
    test_cv = cv_model.transform(test_df)
    idf = IDF(inputCol="raw_features", outputCol="features")
    idf_model = idf.fit(train_cv)
    train_tfidf = idf_model.transform(train_cv)
    test_tfidf = idf_model.transform(test_cv)
    # Regularized logistic regression as the baseline classifier
    lr = LogisticRegression(featuresCol="features", labelCol="sentiment_label", maxIter=100, regParam=0.01)
    lr_model = lr.fit(train_tfidf)
    train_predictions = lr_model.transform(train_tfidf)
    test_predictions = lr_model.transform(test_tfidf)
    # AUC (area under the ROC curve) and simple accuracy on both splits
    evaluator = BinaryClassificationEvaluator(labelCol="sentiment_label", rawPredictionCol="rawPrediction")
    train_auc = evaluator.evaluate(train_predictions)
    test_auc = evaluator.evaluate(test_predictions)
    accuracy_train = train_predictions.filter(col("prediction") == col("sentiment_label")).count() / train_predictions.count()
    accuracy_test = test_predictions.filter(col("prediction") == col("sentiment_label")).count() / test_predictions.count()
    return {"model": lr_model, "train_auc": train_auc, "test_auc": test_auc, "train_accuracy": accuracy_train, "test_accuracy": accuracy_test, "predictions": test_predictions}

def buildIntelligentModel(train_df, test_df):
    # Larger vocabulary and looser document-frequency cutoff than the baseline model
    cv = CountVectorizer(inputCol="filtered_words", outputCol="raw_features", vocabSize=15000, minDF=1.0)
    cv_model = cv.fit(train_df)
    train_cv = cv_model.transform(train_df)
    test_cv = cv_model.transform(test_df)
    idf = IDF(inputCol="raw_features", outputCol="features", minDocFreq=1)
    idf_model = idf.fit(train_cv)
    train_tfidf = idf_model.transform(train_cv)
    test_tfidf = idf_model.transform(test_cv)
    # Multinomial Naive Bayes with additive (Laplace) smoothing
    nb = NaiveBayes(featuresCol="features", labelCol="sentiment_label", smoothing=2.0)
    nb_model = nb.fit(train_tfidf)
    train_predictions = nb_model.transform(train_tfidf)
    test_predictions = nb_model.transform(test_tfidf)
    # Confidence-weighted post-processing: the probability column is an ML vector,
    # so convert it to an array before extracting the positive-class probability
    weighted_predictions = test_predictions.withColumn("confidence_score", vector_to_array(col("probability")).getItem(1))
    weighted_predictions = weighted_predictions.withColumn("adjusted_prediction", when(col("confidence_score") > 0.7, 1).when(col("confidence_score") < 0.3, 0).otherwise(col("prediction")))
    evaluator = BinaryClassificationEvaluator(labelCol="sentiment_label", rawPredictionCol="rawPrediction")
    train_auc = evaluator.evaluate(train_predictions)
    test_auc = evaluator.evaluate(test_predictions)
    accuracy_train = train_predictions.filter(col("prediction") == col("sentiment_label")).count() / train_predictions.count()
    accuracy_test = weighted_predictions.filter(col("adjusted_prediction") == col("sentiment_label")).count() / weighted_predictions.count()
    # Overall distribution of adjusted predictions (positive vs. negative counts)
    sentiment_distribution = weighted_predictions.groupBy("adjusted_prediction").count().collect()
    return {"model": nb_model, "train_auc": train_auc, "test_auc": test_auc, "train_accuracy": accuracy_train, "test_accuracy": accuracy_test, "predictions": weighted_predictions, "distribution": sentiment_distribution}

def compareModelPerformance(traditional_results, intelligent_results):
    # Collect accuracy and AUC for both models plus the absolute improvements
    comparison_metrics = {}
    comparison_metrics["traditional_accuracy"] = traditional_results["test_accuracy"]
    comparison_metrics["intelligent_accuracy"] = intelligent_results["test_accuracy"]
    comparison_metrics["accuracy_improvement"] = intelligent_results["test_accuracy"] - traditional_results["test_accuracy"]
    comparison_metrics["traditional_auc"] = traditional_results["test_auc"]
    comparison_metrics["intelligent_auc"] = intelligent_results["test_auc"]
    comparison_metrics["auc_improvement"] = intelligent_results["test_auc"] - traditional_results["test_auc"]
    # Share of reviews each model classifies as positive
    traditional_positive = traditional_results["predictions"].filter(col("prediction") == 1).count()
    traditional_negative = traditional_results["predictions"].filter(col("prediction") == 0).count()
    intelligent_positive = intelligent_results["predictions"].filter(col("adjusted_prediction") == 1).count()
    intelligent_negative = intelligent_results["predictions"].filter(col("adjusted_prediction") == 0).count()
    comparison_metrics["traditional_positive_ratio"] = traditional_positive / (traditional_positive + traditional_negative)
    comparison_metrics["intelligent_positive_ratio"] = intelligent_positive / (intelligent_positive + intelligent_negative)
    # Confusion matrices (label vs. prediction crosstabs), kept for optional inspection or reporting
    traditional_conf_matrix = traditional_results["predictions"].crosstab("sentiment_label", "prediction")
    intelligent_conf_matrix = intelligent_results["predictions"].crosstab("sentiment_label", "adjusted_prediction")
    comparison_metrics["traditional_precision"] = calculatePrecision(traditional_results["predictions"], "prediction")
    comparison_metrics["intelligent_precision"] = calculatePrecision(intelligent_results["predictions"], "adjusted_prediction")
    comparison_metrics["traditional_recall"] = calculateRecall(traditional_results["predictions"], "prediction")
    comparison_metrics["intelligent_recall"] = calculateRecall(intelligent_results["predictions"], "adjusted_prediction")
    # One-row summary DataFrame that can be displayed or written out alongside the metrics dict
    performance_summary = spark.createDataFrame(
        [(traditional_results["test_accuracy"], intelligent_results["test_accuracy"], comparison_metrics["accuracy_improvement"],
          traditional_results["test_auc"], intelligent_results["test_auc"], comparison_metrics["auc_improvement"])],
        ["traditional_acc", "intelligent_acc", "acc_gain", "traditional_auc", "intelligent_auc", "auc_gain"])
    return comparison_metrics, performance_summary

def calculatePrecision(predictions_df, prediction_col):
    # Precision = TP / (TP + FP), guarding against division by zero
    true_positive = predictions_df.filter((col("sentiment_label") == 1) & (col(prediction_col) == 1)).count()
    false_positive = predictions_df.filter((col("sentiment_label") == 0) & (col(prediction_col) == 1)).count()
    if true_positive + false_positive == 0:
        return 0.0
    return true_positive / (true_positive + false_positive)

def calculateRecall(predictions_df, prediction_col):
    # Recall = TP / (TP + FN), guarding against division by zero
    true_positive = predictions_df.filter((col("sentiment_label") == 1) & (col(prediction_col) == 1)).count()
    false_negative = predictions_df.filter((col("sentiment_label") == 1) & (col(prediction_col) == 0)).count()
    if true_positive + false_negative == 0:
        return 0.0
    return true_positive / (true_positive + false_negative)
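
None of the functions above are invoked in the listing itself, so here is a minimal driver sketch showing how they could be wired together end to end. The input DataFrame raw_reviews (with review_id, rating, and review_text columns, for example loaded from HDFS as in the earlier sketch) and the 80/20 split ratio are assumptions for illustration, not part of the original project code.

# Minimal driver sketch under the assumptions stated above
prepared_df = preprocessTextData(raw_reviews)
train_df, test_df = prepared_df.randomSplit([0.8, 0.2], seed=42)
train_df.cache()
test_df.cache()

traditional_results = buildTraditionalModel(train_df, test_df)
intelligent_results = buildIntelligentModel(train_df, test_df)

metrics, summary_df = compareModelPerformance(traditional_results, intelligent_results)
print("Traditional model  accuracy={:.4f}  AUC={:.4f}".format(metrics["traditional_accuracy"], metrics["traditional_auc"]))
print("Intelligent model  accuracy={:.4f}  AUC={:.4f}".format(metrics["intelligent_accuracy"], metrics["intelligent_auc"]))
summary_df.show(truncate=False)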

Hotel Review Text Sentiment Analysis Study: Documentation

(Documentation screenshot omitted.)
