前言
💖💖作者:计算机程序员小杨 💙💙个人简介:我是一名计算机相关专业的从业者,擅长Java、微信小程序、Python、Golang、安卓Android等多个IT方向。会做一些项目定制化开发、代码讲解、答辩教学、文档编写、也懂一些降重方面的技巧。热爱技术,喜欢钻研新工具和框架,也乐于通过代码解决实际问题,大家有技术代码这一块的问题可以问我! 💛💛想说的话:感谢大家的关注与支持! 💕💕文末获取源码联系 计算机程序员小杨 💜💜 网站实战项目 安卓/小程序实战项目 大数据实战项目 深度学习实战项目 计算机毕业设计选题 💜💜
一.开发工具简介
大数据框架:Hadoop+Spark(本次没用Hive,支持定制) 开发语言:Python+Java(两个版本都支持) 后端框架:Django+Spring Boot(Spring+SpringMVC+Mybatis)(两个版本都支持) 前端:Vue+ElementUI+Echarts+HTML+CSS+JavaScript+jQuery 详细技术点:Hadoop、HDFS、Spark、Spark SQL、Pandas、NumPy 数据库:MySQL
二.系统内容简介
《基于大数据的商店购物趋势分析与可视化系统》是一个集数据采集、处理、分析和可视化于一体的综合性大数据应用系统。该系统采用Hadoop分布式存储架构和Spark大数据处理引擎作为核心技术,通过Python和Django框架构建后端服务,前端采用Vue+ElementUI+Echarts技术栈实现用户交互界面。系统具备商店购物趋势数据管理、消费行为深度分析、客户画像智能构建、客户价值精准评估、销售业绩综合统计以及数据可视化大屏展示等核心功能模块。通过Spark SQL和Pandas进行数据清洗与预处理,利用NumPy进行数学运算分析,结合MySQL数据库实现数据的持久化存储。系统能够处理海量商店交易数据,挖掘隐藏在数据背后的商业价值,为商家提供科学的决策支持,同时为学习者提供了一个完整的大数据技术实践平台,涵盖了从数据存储到分析展示的全链路技术实现。
三.系统功能演示
计算机专业的痛点:毕设选啥好?基于大数据的商店购物趋势分析系统拯救你|毕设|计算机毕设|程序开发|项目实战
四.系统界面展示
五.系统源码展示
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, sum, avg, max, min, desc, asc, when, isnan, isnull
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType
import pandas as pd
import numpy as np
from django.http import JsonResponse
from django.views.decorators.csrf import csrf_exempt
import json
from datetime import datetime, timedelta
import mysql.connector
from collections import defaultdict
import math
# Shared SparkSession for every analysis view in this module.
# Adaptive query execution lets Spark tune shuffle partitioning at runtime.
spark = (
    SparkSession.builder
    .appName("ShopTrendAnalysis")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .getOrCreate()
)
def consumer_behavior_analysis(request):
    """Analyse customer purchase behaviour from the HDFS transaction log.

    Segments customers into value tiers using 25th/75th total-spending
    percentiles, derives hourly transaction patterns and per-category
    preferences, and returns everything as a JSON payload.

    Args:
        request: Django HTTP request (used only for routing).

    Returns:
        JsonResponse with ``status`` = "success" plus ``data``, or
        ``status`` = "error" plus ``message`` on any failure.
    """
    try:
        transaction_df = spark.read.option("header", "true").csv(
            "hdfs://localhost:9000/shop_data/transactions.csv"
        )
        # CSV columns arrive as strings; cast the ones used in arithmetic.
        transaction_df = (
            transaction_df
            .withColumn("amount", col("amount").cast(DoubleType()))
            .withColumn("quantity", col("quantity").cast(IntegerType()))
            .withColumn("transaction_date", col("transaction_date").cast(TimestampType()))
        )
        # Per (customer, category) purchase statistics.
        behavior = transaction_df.groupBy("customer_id", "product_category").agg(
            count("transaction_id").alias("purchase_frequency"),
            sum("amount").alias("total_spending"),
            avg("amount").alias("avg_spending"),
            sum("quantity").alias("total_quantity"),
        )
        # Roll up to one row per customer.
        customer_segments = behavior.groupBy("customer_id").agg(
            sum("purchase_frequency").alias("total_purchases"),
            sum("total_spending").alias("customer_total_spending"),
            avg("avg_spending").alias("customer_avg_spending"),
            count("product_category").alias("category_diversity"),
        )
        # Percentile thresholds computed driver-side with NumPy.
        spending_values = [
            row[0] for row in customer_segments.select("customer_total_spending").collect()
        ]
        spending_array = np.array(spending_values)
        percentile_25 = np.percentile(spending_array, 25)
        percentile_75 = np.percentile(spending_array, 75)
        customer_behavior_segments = customer_segments.withColumn(
            "behavior_segment",
            when(col("customer_total_spending") >= percentile_75, "高价值客户")
            .when(col("customer_total_spending") >= percentile_25, "中等价值客户")
            .otherwise("低价值客户"),
        )
        # Hour = characters 12-13 of the ISO timestamp string representation.
        hourly_patterns = (
            transaction_df
            .withColumn(
                "hour",
                col("transaction_date").cast("string").substr(12, 2).cast(IntegerType()),
            )
            .groupBy("hour")
            .agg(
                count("transaction_id").alias("transaction_count"),
                sum("amount").alias("hourly_revenue"),
                avg("amount").alias("avg_transaction_value"),
            )
            .orderBy("hour")
        )
        category_preferences = transaction_df.groupBy("customer_id", "product_category").agg(
            count("transaction_id").alias("category_purchases"),
            sum("amount").alias("category_spending"),
        )
        # BUG FIX (removed): the original built a "customer_category_ranking"
        # frame using Column.desc() inside withColumn — desc() is a sort
        # expression, not a value, and raises AnalysisException at execution.
        # The frame was never referenced in the response, so it is dropped.
        result_data = {
            # Row objects are not JSON-serialisable; convert to plain dicts.
            "behavior_segments": [r.asDict() for r in customer_behavior_segments.collect()],
            "hourly_patterns": [r.asDict() for r in hourly_patterns.collect()],
            "category_analysis": [r.asDict() for r in category_preferences.collect()],
            "summary_stats": {
                "total_customers": customer_segments.count(),
                "avg_spending_per_customer": customer_segments.agg(
                    avg("customer_total_spending")
                ).collect()[0][0],
                "high_value_customers": customer_behavior_segments.filter(
                    col("behavior_segment") == "高价值客户"
                ).count(),
            },
        }
        return JsonResponse({"status": "success", "data": result_data})
    except Exception as e:
        # Report the failure to the client instead of an HTML 500 page.
        return JsonResponse({"status": "error", "message": str(e)})
def customer_portrait_analysis(request):
    """Build customer portraits: demographics, spending tiers and loyalty.

    Joins customer master data with per-customer transaction summaries,
    buckets customers by age group and spending tier, scores loyalty, and
    attaches each customer's top-category spending.

    Args:
        request: Django HTTP request (used only for routing).

    Returns:
        JsonResponse with ``status`` = "success" plus ``data``, or
        ``status`` = "error" plus ``message`` on any failure.
    """
    try:
        customer_df = spark.read.option("header", "true").csv(
            "hdfs://localhost:9000/shop_data/customers.csv"
        )
        transaction_df = spark.read.option("header", "true").csv(
            "hdfs://localhost:9000/shop_data/transactions.csv"
        )
        customer_df = customer_df.withColumn("age", col("age").cast(IntegerType()))
        # BUG FIX: transaction_date was never cast, so max/min compared raw
        # strings and cast("long") below produced null purchase_days.
        transaction_df = (
            transaction_df
            .withColumn("amount", col("amount").cast(DoubleType()))
            .withColumn("quantity", col("quantity").cast(IntegerType()))
            .withColumn("transaction_date", col("transaction_date").cast(TimestampType()))
        )
        customer_transaction_summary = transaction_df.groupBy("customer_id").agg(
            count("transaction_id").alias("total_transactions"),
            sum("amount").alias("total_spending"),
            avg("amount").alias("avg_transaction_value"),
            max("transaction_date").alias("last_purchase_date"),
            min("transaction_date").alias("first_purchase_date"),
        )
        # Left join keeps customers with no transactions (metrics stay null;
        # null comparisons below fall through to the .otherwise() branches).
        customer_portrait = customer_df.join(customer_transaction_summary, "customer_id", "left")
        age_groups = customer_portrait.withColumn(
            "age_group",
            when(col("age") < 25, "青年群体")
            .when(col("age") < 40, "中年群体")
            .when(col("age") < 60, "中老年群体")
            .otherwise("老年群体"),
        )
        spending_tiers = age_groups.withColumn(
            "spending_tier",
            when(col("total_spending") >= 10000, "高消费层")
            .when(col("total_spending") >= 5000, "中等消费层")
            .when(col("total_spending") >= 1000, "低消费层")
            .otherwise("潜在客户"),
        )
        gender_analysis = spending_tiers.groupBy("gender", "age_group").agg(
            count("customer_id").alias("customer_count"),
            avg("total_spending").alias("avg_spending"),
            avg("total_transactions").alias("avg_transactions"),
        )
        loyalty_analysis = spending_tiers.withColumn(
            "purchase_days",
            (col("last_purchase_date").cast("long") - col("first_purchase_date").cast("long"))
            / 86400,
        ).withColumn(
            "loyalty_score",
            (col("total_transactions") * 0.4) + (col("total_spending") / 1000 * 0.6),
        )
        category_preferences = transaction_df.groupBy("customer_id", "product_category").agg(
            sum("amount").alias("category_spending"),
            count("transaction_id").alias("category_frequency"),
        )
        # BUG FIX: the original called Column.desc() inside withColumn, which
        # raises AnalysisException (desc() builds a sort expression, not a
        # value). The rank column was never consumed, so aggregate directly.
        top_categories_per_customer = category_preferences.groupBy("customer_id").agg(
            max("category_spending").alias("top_category_spending"),
            count("product_category").alias("categories_engaged"),
        )
        final_portrait = loyalty_analysis.join(top_categories_per_customer, "customer_id", "left")
        portrait_summary = {
            # Row objects are not JSON-serialisable; convert to plain dicts.
            "age_distribution": [
                r.asDict() for r in age_groups.groupBy("age_group").count().collect()
            ],
            "spending_distribution": [
                r.asDict() for r in spending_tiers.groupBy("spending_tier").count().collect()
            ],
            "gender_insights": [r.asDict() for r in gender_analysis.collect()],
            "loyalty_metrics": loyalty_analysis.agg(
                avg("loyalty_score").alias("avg_loyalty"),
                max("loyalty_score").alias("max_loyalty"),
                count("customer_id").alias("total_analyzed"),
            ).collect()[0].asDict(),
            # Cap the detail listing to keep the payload bounded.
            "detailed_portraits": [r.asDict() for r in final_portrait.limit(100).collect()],
        }
        return JsonResponse({"status": "success", "data": portrait_summary})
    except Exception as e:
        return JsonResponse({"status": "error", "message": str(e)})
def sales_performance_analysis(request):
    """Aggregate sales KPIs: daily/monthly trends, product and category profit.

    Computes daily and monthly revenue trends, per-product and per-category
    profitability, month-over-month growth rates, and overall totals.

    Args:
        request: Django HTTP request (used only for routing).

    Returns:
        JsonResponse with ``status`` = "success" plus ``data``, or
        ``status`` = "error" plus ``message`` on any failure.
    """
    try:
        sales_df = spark.read.option("header", "true").csv(
            "hdfs://localhost:9000/shop_data/sales_data.csv"
        )
        product_df = spark.read.option("header", "true").csv(
            "hdfs://localhost:9000/shop_data/products.csv"
        )
        # CSV columns arrive as strings; cast the ones used in arithmetic.
        sales_df = (
            sales_df
            .withColumn("revenue", col("revenue").cast(DoubleType()))
            .withColumn("units_sold", col("units_sold").cast(IntegerType()))
            .withColumn("sale_date", col("sale_date").cast(TimestampType()))
        )
        product_df = product_df.withColumn("cost_price", col("cost_price").cast(DoubleType()))
        daily_performance = (
            sales_df.withColumn("sale_day", col("sale_date").cast("date"))
            .groupBy("sale_day")
            .agg(
                sum("revenue").alias("daily_revenue"),
                sum("units_sold").alias("daily_units"),
                count("sale_id").alias("daily_transactions"),
                avg("revenue").alias("avg_transaction_value"),
            )
            .orderBy("sale_day")
        )
        product_performance = (
            sales_df.join(product_df, "product_id", "inner")
            .groupBy("product_id", "product_name", "category")
            .agg(
                sum("revenue").alias("product_revenue"),
                sum("units_sold").alias("product_units"),
                count("sale_id").alias("product_transactions"),
                avg("revenue").alias("avg_product_value"),
            )
        )
        profit_analysis = (
            product_performance
            .join(product_df.select("product_id", "cost_price"), "product_id", "inner")
            .withColumn("total_cost", col("product_units") * col("cost_price"))
        )
        profit_analysis = profit_analysis.withColumn(
            "gross_profit", col("product_revenue") - col("total_cost")
        )
        # ROBUSTNESS: Spark yields null for x / 0, which silently drops
        # zero-revenue products from avg_margin; report 0.0 margin instead.
        profit_analysis = profit_analysis.withColumn(
            "profit_margin",
            when(
                col("product_revenue") != 0,
                (col("gross_profit") / col("product_revenue")) * 100,
            ).otherwise(0.0),
        )
        category_performance = profit_analysis.groupBy("category").agg(
            sum("product_revenue").alias("category_revenue"),
            sum("product_units").alias("category_units"),
            sum("gross_profit").alias("category_profit"),
            avg("profit_margin").alias("avg_margin"),
            count("product_id").alias("products_in_category"),
        ).orderBy(desc("category_revenue"))
        # "YYYY-MM" prefix of the ISO timestamp string.
        monthly_trends = (
            sales_df.withColumn("year_month", col("sale_date").cast("string").substr(1, 7))
            .groupBy("year_month")
            .agg(
                sum("revenue").alias("monthly_revenue"),
                sum("units_sold").alias("monthly_units"),
                count("sale_id").alias("monthly_transactions"),
            )
            .orderBy("year_month")
        )
        top_performers = product_performance.orderBy(desc("product_revenue")).limit(10)
        low_performers = product_performance.orderBy(asc("product_revenue")).limit(10)
        # Month-over-month growth, computed driver-side on the small result.
        monthly_rows = [r.asDict() for r in monthly_trends.collect()]
        growth_rates = []
        for prev, curr in zip(monthly_rows, monthly_rows[1:]):
            # BUG FIX: a null monthly sum made `prev_revenue > 0` raise
            # TypeError (None vs int); coalesce nulls to 0 first.
            prev_revenue = prev["monthly_revenue"] or 0
            curr_revenue = curr["monthly_revenue"] or 0
            growth_rate = (
                ((curr_revenue - prev_revenue) / prev_revenue) * 100 if prev_revenue > 0 else 0
            )
            growth_rates.append({
                "month": curr["year_month"],
                "growth_rate": growth_rate,
                "revenue": curr_revenue,
            })
        # .first() returns None on an empty frame; guard before asDict().
        best_category_row = category_performance.first()
        performance_summary = {
            # Row objects are not JSON-serialisable; convert to plain dicts.
            "daily_trends": [r.asDict() for r in daily_performance.collect()],
            "product_rankings": [r.asDict() for r in top_performers.collect()],
            # BUG FIX: low_performers was computed but never returned.
            "low_performers": [r.asDict() for r in low_performers.collect()],
            "category_analysis": [r.asDict() for r in category_performance.collect()],
            "profit_metrics": [r.asDict() for r in profit_analysis.collect()],
            "growth_trends": growth_rates,
            "overall_stats": {
                "total_revenue": sales_df.agg(sum("revenue")).collect()[0][0],
                "total_units": sales_df.agg(sum("units_sold")).collect()[0][0],
                "avg_daily_revenue": daily_performance.agg(avg("daily_revenue")).collect()[0][0],
                "best_category": best_category_row.asDict() if best_category_row else None,
            },
        }
        return JsonResponse({"status": "success", "data": performance_summary})
    except Exception as e:
        return JsonResponse({"status": "error", "message": str(e)})
六.系统文档展示
结束
💕💕文末获取源码联系 计算机程序员小杨