Preface
- 💖💖 Author: 计算机程序员小杨
- 💙💙 About me: I work in the computer field and specialize in Java, WeChat Mini Programs, Python, Golang, Android, and several other IT areas. I take on customized project development, code walkthroughs, thesis-defense coaching, and documentation writing, and I also know a few techniques for reducing similarity-check scores. I love technology, enjoy digging into new tools and frameworks, and like solving real problems with code. If you have any questions about technology or code, feel free to ask me!
- 💛💛 A quick note: thank you all for your attention and support!
- 💕💕 To get the source code, contact 计算机程序员小杨 at the end of this article
- 💜💜
- Web application projects
- Android / Mini Program projects
- Big data projects
- Deep learning projects
- Computer science graduation project topics
- 💜💜
I. Development Tools
- Big data framework: Hadoop + Spark (Hive is not used in this version; customization is supported)
- Languages: Python + Java (both versions are supported)
- Backend frameworks: Django + Spring Boot (Spring + SpringMVC + Mybatis) (both versions are supported)
- Frontend: Vue + ElementUI + Echarts + HTML + CSS + JavaScript + jQuery
- Key technologies: Hadoop, HDFS, Spark, Spark SQL, Pandas, NumPy (a minimal wiring sketch follows this list)
- Database: MySQL
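As a quick illustration of how these pieces connect, here is a minimal sketch (not the project's actual code): Spark reads raw sales data from HDFS, Spark SQL aggregates it on the cluster, and only the small result is pulled back to the driver for Pandas/NumPy post-processing. The HDFS path, file name, and column names are hypothetical placeholders.

# Minimal wiring sketch for the listed stack; paths and column names are hypothetical
from pyspark.sql import SparkSession
import numpy as np

spark = SparkSession.builder.appName("StackSketch").getOrCreate()

# Read a (hypothetical) sales CSV previously uploaded to HDFS
sales = spark.read.csv("hdfs://namenode:9000/supermarket/sales.csv", header=True, inferSchema=True)
sales.createOrReplaceTempView("sales")

# Spark SQL runs the heavy aggregation on the cluster
daily = spark.sql("SELECT to_date(purchase_date) AS sale_date, SUM(amount) AS revenue FROM sales GROUP BY to_date(purchase_date) ORDER BY sale_date")

# Only the aggregated result comes back to the driver as a Pandas DataFrame
daily_pd = daily.toPandas()
print("Average daily revenue:", np.mean(daily_pd["revenue"]))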
II. System Overview
The big-data-based supermarket sales statistics and analysis system is a complete data processing and analysis solution. It uses the Hadoop Distributed File System (HDFS) as the underlying storage layer and the Spark computing framework to process large volumes of sales data efficiently. HDFS holds the large datasets a supermarket generates every day, such as sales transactions, product information, and customer purchase records, while Spark SQL performs the complex queries and statistical computations. The frontend is built with the Vue framework and the ElementUI component library, and uses the Echarts charting library to visualize the data, including sales trend charts, product association charts, and customer behavior heat maps.

The core features cover in-depth customer purchasing behavior analysis, product association mining, multi-dimensional sales statistics, promotion effectiveness evaluation, and time-series sales trend analysis, together with a data dashboard for real-time monitoring of key business metrics. The backend exposes RESTful APIs built on Spring Boot and uses Mybatis to interact with the MySQL database, ensuring reliable storage of metadata and user information. The overall architecture supports concurrent processing of large data volumes, can handle terabyte-scale sales data, and gives supermarket operators a sound data basis for business decisions.
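To make the backend-to-frontend data flow concrete, here is a minimal sketch of how one analysis result could be exposed as a JSON API for the Vue + Echarts frontend (this is the Python/Django flavor of the stack, not the project's actual controller). The module name spark_analysis and the URL route are assumptions for illustration; only customer_behavior_analysis() comes from the source code in section V.

# Hedged sketch: a Django view returning Spark analysis results as JSON
from django.http import JsonResponse

# Hypothetical module name; assumes the Spark code from section V is importable
from spark_analysis import customer_behavior_analysis

def customer_behavior_view(request):
    # Run the Spark job and hand the rows straight to the Vue + Echarts frontend
    data = customer_behavior_analysis()
    return JsonResponse({"code": 0, "data": data}, json_dumps_params={"ensure_ascii": False})

# In urls.py this view might be mapped as:
#   path("api/analysis/customer-behavior/", customer_behavior_view)   (hypothetical route)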
III. Feature Demo
Finish your big-data graduation project in 7 days: a complete tutorial on the supermarket sales statistics and analysis system, from setting up Hadoop to hands-on Spark
IV. System UI
V. Source Code
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import pandas as pd
import numpy as np

# Shared SparkSession with adaptive query execution enabled
spark = SparkSession.builder.appName("SupermarketAnalysis").config("spark.sql.adaptive.enabled", "true").getOrCreate()
def customer_behavior_analysis():
    # Load sales records and customer master data from MySQL over JDBC
    sales_df = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/supermarket").option("dbtable", "sales_records").option("user", "root").option("password", "password").load()
    customer_df = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/supermarket").option("dbtable", "customers").option("user", "root").option("password", "password").load()
    joined_df = sales_df.join(customer_df, "customer_id")
    # Per-customer spending, frequency, basket-variety and recency statistics
    customer_stats = joined_df.groupBy("customer_id", "customer_name").agg(
        sum("amount").alias("total_spending"),
        count("order_id").alias("purchase_frequency"),
        avg("amount").alias("avg_order_value"),
        countDistinct("product_id").alias("product_variety"),
        max("purchase_date").alias("last_purchase_date"),
        min("purchase_date").alias("first_purchase_date")
    )
    # Customer lifetime in days and purchases per active day
    customer_stats = customer_stats.withColumn("customer_lifetime_days",
        datediff(col("last_purchase_date"), col("first_purchase_date")))
    customer_stats = customer_stats.withColumn("purchase_per_day",
        when(col("customer_lifetime_days") > 0, col("purchase_frequency") / col("customer_lifetime_days")).otherwise(0))
    # Customers above the 80th spending percentile / 70th frequency percentile
    high_value_customers = customer_stats.filter(col("total_spending") > customer_stats.select(percentile_approx("total_spending", 0.8)).collect()[0][0])
    frequent_customers = customer_stats.filter(col("purchase_frequency") > customer_stats.select(percentile_approx("purchase_frequency", 0.7)).collect()[0][0])
    # Rule-based customer segmentation
    customer_segments = customer_stats.withColumn("customer_segment",
        when((col("total_spending") > 1000) & (col("purchase_frequency") > 10), "VIP customer")
        .when((col("total_spending") > 500) & (col("purchase_frequency") > 5), "High-value customer")
        .when(col("purchase_frequency") > 3, "Active customer")
        .otherwise("Regular customer"))
    result = customer_segments.select("customer_id", "customer_name", "total_spending", "purchase_frequency", "avg_order_value", "customer_segment").collect()
    return [row.asDict() for row in result]
def product_association_analysis():
    # Load sales records and product master data from MySQL over JDBC
    sales_df = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/supermarket").option("dbtable", "sales_records").option("user", "root").option("password", "password").load()
    product_df = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/supermarket").option("dbtable", "products").option("user", "root").option("password", "password").load()
    # Distinct (order, product) pairs, then a self-join to get product pairs bought in the same order
    order_products = sales_df.select("order_id", "product_id").distinct()
    product_pairs = order_products.alias("a").join(order_products.alias("b"), col("a.order_id") == col("b.order_id")).filter(col("a.product_id") < col("b.product_id")).select(col("a.product_id").alias("product_a_id"), col("b.product_id").alias("product_b_id"))
    association_counts = product_pairs.groupBy("product_a_id", "product_b_id").count().withColumnRenamed("count", "pair_count")
    total_orders = sales_df.select("order_id").distinct().count()
    # How many distinct orders each product appears in (needed for confidence and lift)
    product_a_counts = sales_df.groupBy("product_id").agg(countDistinct("order_id").alias("product_a_count")).withColumnRenamed("product_id", "product_a_id")
    product_b_counts = sales_df.groupBy("product_id").agg(countDistinct("order_id").alias("product_b_count")).withColumnRenamed("product_id", "product_b_id")
    association_with_support = association_counts.join(product_a_counts, "product_a_id").join(product_b_counts, "product_b_id")
    # Support = P(A and B); confidence = P(B|A) and P(A|B)
    association_metrics = association_with_support.withColumn("support", col("pair_count") / total_orders).withColumn("confidence_a_to_b", col("pair_count") / col("product_a_count")).withColumn("confidence_b_to_a", col("pair_count") / col("product_b_count"))
    # Lift = confidence(A -> B) / P(B)
    association_metrics = association_metrics.withColumn("lift", col("confidence_a_to_b") / (col("product_b_count") / total_orders))
    # Keep only reasonably strong association rules
    strong_associations = association_metrics.filter((col("support") > 0.01) & (col("confidence_a_to_b") > 0.1) & (col("lift") > 1.2))
    # Attach product names for both sides of each rule
    product_info_a = product_df.select(col("product_id").alias("product_a_id"), col("product_name").alias("product_a_name"))
    product_info_b = product_df.select(col("product_id").alias("product_b_id"), col("product_name").alias("product_b_name"))
    final_associations = strong_associations.join(product_info_a, "product_a_id").join(product_info_b, "product_b_id")
    result = final_associations.select("product_a_name", "product_b_name", "support", "confidence_a_to_b", "lift").orderBy(desc("lift")).collect()
    return [row.asDict() for row in result]
def sales_trend_analysis():
    # Load sales and product tables from MySQL over JDBC
    sales_df = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/supermarket").option("dbtable", "sales_records").option("user", "root").option("password", "password").load()
    product_df = spark.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/supermarket").option("dbtable", "products").option("user", "root").option("password", "password").load()
    # Derive day / week / month buckets from the purchase timestamp
    sales_with_date = sales_df.withColumn("sale_date", to_date(col("purchase_date"))).withColumn("sale_month", date_format(col("purchase_date"), "yyyy-MM")).withColumn("sale_week", date_format(col("purchase_date"), "yyyy-ww"))
    daily_sales = sales_with_date.groupBy("sale_date").agg(sum("amount").alias("daily_revenue"), sum("quantity").alias("daily_quantity"), count("order_id").alias("daily_orders"))
    monthly_sales = sales_with_date.groupBy("sale_month").agg(sum("amount").alias("monthly_revenue"), sum("quantity").alias("monthly_quantity"), count("order_id").alias("monthly_orders"))
    weekly_sales = sales_with_date.groupBy("sale_week").agg(sum("amount").alias("weekly_revenue"), sum("quantity").alias("weekly_quantity"), count("order_id").alias("weekly_orders"))
    # Revenue and quantity per product and per category
    product_sales = sales_df.join(product_df, "product_id").groupBy("product_id", "product_name", "category").agg(sum("amount").alias("product_revenue"), sum("quantity").alias("product_quantity"))
    top_products = product_sales.orderBy(desc("product_revenue")).limit(20)
    category_sales = product_sales.groupBy("category").agg(sum("product_revenue").alias("category_revenue"), sum("product_quantity").alias("category_quantity"))
    # Intraday and weekday patterns
    hour_sales = sales_with_date.withColumn("sale_hour", hour(col("purchase_date"))).groupBy("sale_hour").agg(sum("amount").alias("hourly_revenue"), count("order_id").alias("hourly_orders"))
    weekday_sales = sales_with_date.withColumn("weekday", dayofweek(col("sale_date"))).groupBy("weekday").agg(sum("amount").alias("weekday_revenue"), avg("amount").alias("avg_weekday_revenue"))
    # Day-over-day growth rate via a window function over the daily series
    growth_analysis = daily_sales.withColumn("prev_day_revenue", lag("daily_revenue").over(Window.orderBy("sale_date"))).withColumn("growth_rate", (col("daily_revenue") - col("prev_day_revenue")) / col("prev_day_revenue") * 100)
    result = {
        "daily_trend": [row.asDict() for row in daily_sales.orderBy("sale_date").collect()],
        "weekly_trend": [row.asDict() for row in weekly_sales.orderBy("sale_week").collect()],
        "monthly_trend": [row.asDict() for row in monthly_sales.orderBy("sale_month").collect()],
        "top_products": [row.asDict() for row in top_products.collect()],
        "category_performance": [row.asDict() for row in category_sales.orderBy(desc("category_revenue")).collect()],
        "hourly_pattern": [row.asDict() for row in hour_sales.orderBy("sale_hour").collect()],
        "weekday_pattern": [row.asDict() for row in weekday_sales.orderBy("weekday").collect()],
        "daily_growth": [row.asDict() for row in growth_analysis.orderBy("sale_date").collect()]
    }
    return result
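For completeness, here is a minimal driver-side usage sketch, assuming the three functions above are executed as one Spark batch job; the output file name analysis_result.json is a placeholder rather than part of the project.

# Usage sketch: run all three analyses and dump the combined result as JSON
import json

if __name__ == "__main__":
    report = {
        "customer_behavior": customer_behavior_analysis(),
        "product_association": product_association_analysis(),
        "sales_trend": sales_trend_analysis(),
    }
    # default=str handles the date and decimal values returned in Spark rows
    with open("analysis_result.json", "w", encoding="utf-8") as f:
        json.dump(report, f, ensure_ascii=False, indent=2, default=str)
    spark.stop()  # release cluster resources once the batch job is done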