一、个人简介
💖💖作者:计算机编程果茶熊 💙💙个人简介:曾长期从事计算机专业培训教学,担任过编程老师,同时本人也热爱上课教学,擅长Java、微信小程序、Python、Golang、安卓Android等多个IT方向。会做一些项目定制化开发、代码讲解、答辩教学、文档编写、也懂一些降重方面的技巧。平常喜欢分享一些自己开发中遇到的问题的解决办法,也喜欢交流技术,大家有技术代码这一块的问题可以问我! 💛💛想说的话:感谢大家的关注与支持! 💜💜 网站实战项目 安卓/小程序实战项目 大数据实战项目 计算机毕业设计选题 💕💕文末获取源码联系计算机编程果茶熊
二、系统介绍
大数据框架:Hadoop+Spark(Hive需要定制修改) 开发语言:Java+Python(两个版本都支持) 数据库:MySQL 后端框架:SpringBoot(Spring+SpringMVC+Mybatis)+Django(两个版本都支持) 前端:Vue+Echarts+HTML+CSS+JavaScript+jQuery
三、视频解说
四、部分功能展示
五、部分代码展示
import json

import numpy as np
import pandas as pd

from django.http import JsonResponse
from django.views import View
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Shared module-level SparkSession used by all view functions below.
# Adaptive Query Execution (AQE) is enabled via spark.sql.adaptive.enabled.
# NOTE(review): the session is created at import time — consider lazy
# initialization so importing this module does not require a live Spark cluster.
spark = SparkSession.builder.appName("CosmeticsAnalysis").config("spark.sql.adaptive.enabled", "true").getOrCreate()
def brand_product_analysis(request):
    """Rank cosmetics brands by a weighted diversity score and tag market position.

    Reads the ``products`` table over JDBC, aggregates per-brand product volume
    and category breadth, scores each brand, keeps the top 20 by score, and
    labels each with a market-position tier based on product volume.

    Args:
        request: Django HTTP request (not inspected; routing only).

    Returns:
        JsonResponse with ``status``, ``data`` (list of per-brand dicts) and
        ``total`` (number of brands returned, at most 20).
    """
    # SECURITY NOTE(review): JDBC credentials are hard-coded here (and in the
    # sibling views) — move them to Django settings / environment variables.
    df = (
        spark.read.format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306/cosmetics")
        .option("dbtable", "products")
        .option("user", "root")
        .option("password", "password")
        .load()
    )
    # Per-brand product volume and number of distinct categories covered.
    brand_counts = df.groupBy("brand_name").agg(
        count("product_id").alias("product_count"),
        countDistinct("category").alias("category_breadth"),
    )
    # Weighted score: product volume dominates (0.7) over breadth (0.3).
    brand_stats = brand_counts.withColumn(
        "diversity_score",
        col("category_breadth") * 0.3 + col("product_count") * 0.7,
    )
    top_brands = brand_stats.orderBy(desc("diversity_score")).limit(20)
    # Tier by raw product volume: >50 leader, >20 challenger, else follower.
    brand_analysis = top_brands.withColumn(
        "market_position",
        when(col("product_count") > 50, "领导者")
        .when(col("product_count") > 20, "挑战者")
        .otherwise("追随者"),
    )
    # collect() is safe here: the frame was capped at 20 rows above.
    processed_results = [
        {
            "brand": row["brand_name"],
            "products": row["product_count"],
            "categories": row["category_breadth"],
            "score": round(row["diversity_score"], 2),
            "position": row["market_position"],
        }
        for row in brand_analysis.collect()
    ]
    response_data = {
        "status": "success",
        "data": processed_results,
        "total": len(processed_results),
    }
    return JsonResponse(response_data, safe=False)
def category_price_analysis(request):
    """Summarise price statistics and average customer rating per category.

    Joins price aggregates from ``products`` (filtered to 0 < price < 10000)
    with per-category mean ratings derived from ``reviews``, sorted by average
    price descending.

    Args:
        request: Django HTTP request (not inspected; routing only).

    Returns:
        JsonResponse ``{"success": True, "data": [...]}`` with one dict per
        category.
    """
    # SECURITY NOTE(review): hard-coded JDBC credentials — move to settings.
    products_df = (
        spark.read.format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306/cosmetics")
        .option("dbtable", "products")
        .option("user", "root")
        .option("password", "password")
        .load()
    )
    # Drop non-positive and implausibly large prices before aggregating.
    price_filtered = products_df.filter((col("price") > 0) & (col("price") < 10000))
    category_stats = price_filtered.groupBy("category").agg(
        avg("price").alias("avg_price"),
        stddev("price").alias("price_std"),
        count("*").alias("product_count"),
        min("price").alias("min_price"),
        max("price").alias("max_price"),
    )
    # price_cv is well-defined: avg_price > 0 because of the filter above.
    category_with_range = category_stats.withColumn(
        "price_range", col("max_price") - col("min_price")
    ).withColumn("price_cv", col("price_std") / col("avg_price"))
    rating_df = (
        spark.read.format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306/cosmetics")
        .option("dbtable", "reviews")
        .option("user", "root")
        .option("password", "password")
        .load()
    )
    avg_ratings = rating_df.groupBy("product_id").agg(avg("rating").alias("avg_rating"))
    # Left join keeps products without reviews (their avg_rating is NULL and
    # is ignored by the avg() below).
    products_with_rating = products_df.join(avg_ratings, "product_id", "left")
    # NOTE(review): "rated_products" actually counts ALL products in the
    # category (left join), not only those with at least one review.
    category_rating_stats = products_with_rating.groupBy("category").agg(
        avg("avg_rating").alias("category_rating"),
        count("*").alias("rated_products"),
    )
    final_analysis = category_with_range.join(category_rating_stats, "category", "inner")
    sorted_results = final_analysis.orderBy(desc("avg_price"))
    formatted_data = [
        {
            "category": row["category"],
            "average_price": round(row["avg_price"], 2),
            # BUG FIX: stddev() is NULL for single-product categories; the
            # original round(None, 2) raised TypeError. Report 0 instead.
            "price_std": round(row["price_std"], 2) if row["price_std"] is not None else 0,
            "product_count": row["product_count"],
            "price_range": round(row["price_range"], 2),
            "rating": round(row["category_rating"], 2) if row["category_rating"] else 0,
        }
        for row in sorted_results.collect()
    ]
    return JsonResponse({"success": True, "data": formatted_data})
def price_rating_clustering(request):
    """Segment products into 5 KMeans clusters on price, rating and popularity.

    Features are log(price+1), mean review rating and review count, joined from
    the ``products`` and ``reviews`` tables. Each resulting cluster is labelled
    ("性价比优选" / "高端产品" / "中端产品") from its average price and rating.

    Args:
        request: Django HTTP request (not inspected; routing only).

    Returns:
        JsonResponse with ``status``, ``clusters`` (per-cluster summaries) and
        ``total_products``.
    """
    # SECURITY NOTE(review): hard-coded JDBC credentials — move to settings.
    products_df = (
        spark.read.format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306/cosmetics")
        .option("dbtable", "products")
        .option("user", "root")
        .option("password", "password")
        .load()
    )
    reviews_df = (
        spark.read.format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306/cosmetics")
        .option("dbtable", "reviews")
        .option("user", "root")
        .option("password", "password")
        .load()
    )
    product_ratings = reviews_df.groupBy("product_id").agg(
        avg("rating").alias("avg_rating"),
        count("*").alias("review_count"),
    )
    # Inner join: only products that have at least one review are clustered.
    clustering_data = products_df.join(product_ratings, "product_id", "inner").filter(
        (col("price") > 0) & (col("avg_rating") > 0)
    )
    # Log-transform price to tame its skew; +1 keeps the argument positive.
    price_normalized = clustering_data.withColumn("price_log", log(col("price") + 1))
    feature_data = price_normalized.select(
        "product_id", "price_log", "avg_rating", "review_count"
    ).na.drop()
    assembler = VectorAssembler(
        inputCols=["price_log", "avg_rating", "review_count"],
        outputCol="raw_features",
    )
    feature_vector = assembler.transform(feature_data)
    # BUG FIX: the original clustered on raw features, so review_count
    # (unbounded magnitude) dominated the Euclidean distance over avg_rating
    # (1-5) and price_log. Standardize all three to unit variance first.
    scaler = StandardScaler(
        inputCol="raw_features", outputCol="features", withMean=True, withStd=True
    )
    feature_vector = scaler.fit(feature_vector).transform(feature_vector)
    kmeans = KMeans(k=5, seed=42, maxIter=100, featuresCol="features", predictionCol="cluster")
    model = kmeans.fit(feature_vector)
    clustered_data = model.transform(feature_vector)
    # Per-cluster aggregates are computed on the ORIGINAL (unscaled) columns.
    cluster_stats = clustered_data.groupBy("cluster").agg(
        count("*").alias("cluster_size"),
        avg("price_log").alias("avg_log_price"),
        avg("avg_rating").alias("cluster_rating"),
        avg("review_count").alias("avg_reviews"),
    )
    # NOTE(review): exp(mean(log(price+1))) - 1 is a geometric-mean-style
    # price, not the arithmetic mean — it understates the average for
    # right-skewed prices. Kept for label thresholds; rename if exactness matters.
    cluster_analysis = cluster_stats.withColumn("actual_price", exp(col("avg_log_price")) - 1)
    cluster_info = []
    for row in cluster_analysis.collect():
        cluster_data = {
            "cluster_id": row["cluster"],
            "size": row["cluster_size"],
            "avg_price": round(row["actual_price"], 2),
            "avg_rating": round(row["cluster_rating"], 2),
            "avg_reviews": round(row["avg_reviews"], 0),
        }
        # Label tiers: cheap + well-rated, premium, or mid-range fallback.
        if cluster_data["avg_price"] < 100 and cluster_data["avg_rating"] > 4.0:
            cluster_data["label"] = "性价比优选"
        elif cluster_data["avg_price"] > 500:
            cluster_data["label"] = "高端产品"
        else:
            cluster_data["label"] = "中端产品"
        cluster_info.append(cluster_data)
    response = {
        "status": "completed",
        "clusters": cluster_info,
        "total_products": sum(c["size"] for c in cluster_info),
    }
    return JsonResponse(response)
六、部分文档展示
七、END
💕💕文末获取源码联系计算机编程果茶熊