💖💖作者:计算机毕业设计杰瑞 💙💙个人简介:曾长期从事计算机专业培训教学,本人也热爱上课教学,语言擅长Java、微信小程序、Python、Golang、安卓Android等,开发项目包括大数据、深度学习、网站、小程序、安卓、算法。平常会做一些项目定制化开发、代码讲解、答辩教学、文档编写、也懂一些降重方面的技巧。平常喜欢分享一些自己开发中遇到的问题的解决办法,也喜欢交流技术,大家有技术代码这一块的问题可以问我! 💛💛想说的话:感谢大家的关注与支持! 💜💜 网站实战项目 安卓/小程序实战项目 大数据实战项目 深度学习实战项目 计算机毕业设计选题推荐
基于机器学习的电商评论情感分析介绍
本电商评论情感分析系统采用Hadoop+Spark大数据框架构建,专门针对京东等电商平台的用户评论进行深度情感挖掘和智能分析。系统运用Spark SQL进行海量评论数据的分布式处理,结合Pandas和NumPy进行数据预处理和特征工程,通过Django后端框架提供稳定的API服务,前端采用Vue+ElementUI构建直观的操作界面。系统核心功能涵盖用户权限管理、京东评论数据的批量导入与清洗、基于机器学习算法的情感评分预测、多维度数据统计分析以及个性化的系统配置管理。整个系统充分发挥Spark分布式计算的优势,能够高效处理大规模评论文本数据,实现从数据采集、预处理、特征提取到情感分类的完整工作流,为电商企业提供用户情感洞察和商品口碑分析的技术支撑,同时通过Echarts图表组件实现数据可视化展示,让复杂的情感分析结果以直观友好的方式呈现给用户。
基于机器学习的电商评论情感分析演示视频
基于机器学习的电商评论情感分析演示图片
基于机器学习的电商评论情感分析代码展示
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, regexp_replace, length, avg, count, desc
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
import pandas as pd
import numpy as np
import jieba
import re
# Build (or reuse) the process-wide SparkSession for the comment-analysis jobs.
# Adaptive query execution plus partition coalescing are switched on so Spark
# can right-size shuffle partitions for the comment workloads at runtime.
_session_builder = SparkSession.builder.appName("EcommerceCommentAnalysis")
_session_builder = _session_builder.config("spark.sql.adaptive.enabled", "true")
_session_builder = _session_builder.config("spark.sql.adaptive.coalescePartitions.enabled", "true")
spark = _session_builder.getOrCreate()
def process_jd_comment_data(file_path, table_name, sample_fraction=0.1):
    """Load, clean, deduplicate and sample a JD comment CSV file.

    Args:
        file_path: Path to a UTF-8 CSV with a header row. It is expected to
            contain at least comment_text, rating, product_id and user_id
            columns (the select below reads exactly those).
        table_name: Name under which the sampled frame is registered as a
            Spark temp view for later SQL queries.
        sample_fraction: Fraction of rows kept after deduplication.
            Defaults to 0.1, the previously hard-coded sampling rate.

    Returns:
        dict with total_count, positive_ratio, negative_ratio, avg_length
        and processed_data (a pandas DataFrame of the sampled rows).
    """
    raw_df = spark.read.option("header", "true").option("encoding", "UTF-8").csv(file_path)
    # Drop null or too-short comments before any further processing.
    cleaned_df = raw_df.filter(col("comment_text").isNotNull() & (length(col("comment_text")) > 10))
    # Keep only CJK ideographs, latin letters, digits and whitespace.
    # \\s is doubled so Python does not emit an invalid-escape warning;
    # the regex string Spark receives is byte-identical to before.
    cleaned_df = cleaned_df.withColumn("comment_text", regexp_replace(col("comment_text"), "[^\u4e00-\u9fa5a-zA-Z0-9\\s]", ""))
    # Binarize the star rating: more than 3 stars is positive (1), else 0.
    cleaned_df = cleaned_df.withColumn("rating", when(col("rating") > 3, 1).otherwise(0))
    cleaned_df = cleaned_df.withColumn("comment_length", length(col("comment_text")))
    # Re-check length bounds after punctuation stripping shortened the text.
    filtered_df = cleaned_df.filter((col("comment_length") >= 10) & (col("comment_length") <= 500))
    deduplicated_df = filtered_df.dropDuplicates(["comment_text"])
    # Fixed seed keeps the sample reproducible between runs.
    sample_df = deduplicated_df.sample(withReplacement=False, fraction=sample_fraction, seed=42)
    sample_df.createOrReplaceTempView(table_name)
    result_count = sample_df.count()
    positive_count = sample_df.filter(col("rating") == 1).count()
    negative_count = sample_df.filter(col("rating") == 0).count()
    # avg() over an empty frame yields None; passed through to the caller as-is.
    avg_length = sample_df.select(avg(col("comment_length"))).collect()[0][0]
    processed_data = sample_df.select("comment_text", "rating", "comment_length", "product_id", "user_id").collect()
    pandas_df = pd.DataFrame([row.asDict() for row in processed_data])
    # Guard the ratio denominators so an empty sample does not raise
    # ZeroDivisionError (ratios are 0/1 == 0.0 in that case).
    safe_total = result_count if result_count > 0 else 1
    return {
        "total_count": result_count,
        "positive_ratio": positive_count / safe_total,
        "negative_ratio": negative_count / safe_total,
        "avg_length": avg_length,
        "processed_data": pandas_df,
    }
def predict_sentiment_score(comment_text, model_path=None):
    """Score the sentiment of a single comment string.

    When *model_path* is falsy, a fast lexicon-based score is computed in
    pure Python (no Spark job). When it is truthy, a TF-IDF + logistic
    regression pipeline is used instead.

    Args:
        comment_text: Raw comment text (Chinese/ASCII mix).
        model_path: Optional switch to the ML pipeline path.

    Returns:
        dict with sentiment_label (0/1), confidence_score (0..1),
        positive_words, negative_words and total_words counts.
    """
    # Strip everything except CJK ideographs, latin letters, digits, whitespace.
    cleaned_text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\s]', '', comment_text)
    words = jieba.lcut(cleaned_text)
    filtered_words = [word for word in words if len(word) > 1 and word not in ['的', '了', '是', '在', '有', '和', '就', '都', '而', '及', '与', '或']]
    word_count = len(filtered_words)
    positive_words = ['好', '不错', '满意', '喜欢', '推荐', '优秀', '完美', '赞', '棒', '超赞', '给力', '值得', '划算', '实惠']
    negative_words = ['差', '坏', '失望', '糟糕', '垃圾', '后悔', '不满', '问题', '瑕疵', '缺陷', '故障', '损坏', '破损', '不值']
    positive_count = sum(1 for word in filtered_words if word in positive_words)
    negative_count = sum(1 for word in filtered_words if word in negative_words)
    if model_path:
        input_df = spark.createDataFrame([(comment_text, 0)], ["comment_text", "rating"])
        tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")
        stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
        hashing_tf = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=1000)
        idf = IDF(inputCol="raw_features", outputCol="features")
        lr = LogisticRegression(featuresCol="features", labelCol="rating", maxIter=100)
        pipeline = Pipeline(stages=[tokenizer, stopwords_remover, hashing_tf, idf, lr])
        # NOTE(review): this fits on the single input row rather than loading
        # a persisted model from model_path — presumably a placeholder; a real
        # deployment should PipelineModel.load(model_path). Kept as-is apart
        # from the crash fixes below.
        model = pipeline.fit(input_df)
        row = model.transform(input_df).select("prediction", "probability").collect()[0]
        sentiment_label = int(row.prediction)
        # row.probability is a Vector; float(Vector) raises, so take the
        # probability of the predicted class as the confidence.
        confidence_score = float(row.probability[sentiment_label])
    else:
        # Lexicon score in [-100, 100]; max(word_count, 1) avoids div-by-zero.
        sentiment_score = (positive_count - negative_count) / max(word_count, 1) * 100
        sentiment_score = max(-100, min(100, sentiment_score))
        # Map linearly onto [0, 1] so it can act as a pseudo-probability.
        confidence_score = (sentiment_score + 100) / 200
        # Computed directly in Python: the previous Spark round-trip passed a
        # Python bool to when() (TypeError) and used the never-imported lit().
        sentiment_label = 1 if confidence_score > 0.5 else 0
    return {"sentiment_label": sentiment_label, "confidence_score": confidence_score, "positive_words": positive_count, "negative_words": negative_count, "total_words": word_count}
def generate_analysis_statistics(date_range=None, product_category=None):
    """Aggregate sentiment statistics over the processed comment view.

    Reads the jd_comments_processed temp view and computes overall rates,
    per-product averages, a daily trend, and a word-frequency top-50.

    Args:
        date_range: Optional (start, end) pair filtering comment_date
            inclusively (same semantics as SQL BETWEEN).
        product_category: Optional exact-match category filter.

    Returns:
        dict with total_comments, positive_rate, negative_rate,
        sentiment_distribution, top_products, bottom_products,
        daily_trend and word_cloud_data.
    """
    # Filter via the DataFrame API instead of f-string SQL concatenation:
    # the previous string-built query was open to SQL injection through
    # date_range / product_category values.
    df = spark.sql("SELECT * FROM jd_comments_processed")
    if date_range:
        df = df.filter(col("comment_date").between(date_range[0], date_range[1]))
    if product_category:
        df = df.filter(col("product_category") == product_category)
    total_comments = df.count()
    positive_comments = df.filter(col("rating") == 1).count()
    negative_comments = df.filter(col("rating") == 0).count()
    sentiment_distribution = df.groupBy("rating").count().orderBy("rating")
    # Only products with at least 10 comments, ranked by mean binary rating.
    product_sentiment = df.groupBy("product_id").agg(avg("rating").alias("avg_rating"), count("*").alias("comment_count")).filter(col("comment_count") >= 10).orderBy(desc("avg_rating"))
    daily_stats = df.groupBy("comment_date").agg(count("*").alias("daily_count"), avg("rating").alias("daily_avg_rating")).orderBy("comment_date")
    # Tokenize on the executors and keep the 50 most frequent multi-char words.
    word_frequency = df.select("comment_text").rdd.flatMap(lambda row: jieba.lcut(row.comment_text)).filter(lambda word: len(word) > 1).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b).sortBy(lambda x: x[1], ascending=False).take(50)
    sentiment_trend = daily_stats.select("comment_date", "daily_avg_rating").collect()
    high_rating_products = product_sentiment.filter(col("avg_rating") >= 0.8).select("product_id", "avg_rating").collect()
    low_rating_products = product_sentiment.filter(col("avg_rating") <= 0.3).select("product_id", "avg_rating").collect()
    return {
        "total_comments": total_comments,
        "positive_rate": positive_comments / total_comments if total_comments > 0 else 0,
        "negative_rate": negative_comments / total_comments if total_comments > 0 else 0,
        # row["count"] is required here: Row inherits tuple.count, so the
        # attribute access row.count returns that bound method instead of
        # the aggregated "count" column value.
        "sentiment_distribution": [{"rating": row.rating, "count": row["count"]} for row in sentiment_distribution.collect()],
        "top_products": [{"product_id": row.product_id, "avg_rating": float(row.avg_rating)} for row in high_rating_products],
        "bottom_products": [{"product_id": row.product_id, "avg_rating": float(row.avg_rating)} for row in low_rating_products],
        "daily_trend": [{"date": row.comment_date, "avg_rating": float(row.daily_avg_rating)} for row in sentiment_trend],
        "word_cloud_data": [{"word": word, "frequency": freq} for word, freq in word_frequency],
    }
基于机器学习的电商评论情感分析文档展示
💖💖作者:计算机毕业设计杰瑞 💙💙个人简介:曾长期从事计算机专业培训教学,本人也热爱上课教学,语言擅长Java、微信小程序、Python、Golang、安卓Android等,开发项目包括大数据、深度学习、网站、小程序、安卓、算法。平常会做一些项目定制化开发、代码讲解、答辩教学、文档编写、也懂一些降重方面的技巧。平常喜欢分享一些自己开发中遇到的问题的解决办法,也喜欢交流技术,大家有技术代码这一块的问题可以问我! 💛💛想说的话:感谢大家的关注与支持! 💜💜 网站实战项目 安卓/小程序实战项目 大数据实战项目 深度学习实战项目 计算机毕业设计选题推荐