一、个人简介
💖💖作者:计算机编程果茶熊 💙💙个人简介:曾长期从事计算机专业培训教学,担任过编程老师,同时本人也热爱上课教学,擅长Java、微信小程序、Python、Golang、安卓Android等多个IT方向。会做一些项目定制化开发、代码讲解、答辩教学、文档编写、也懂一些降重方面的技巧。平常喜欢分享一些自己开发中遇到的问题的解决办法,也喜欢交流技术,大家有技术代码这一块的问题可以问我! 💛💛想说的话:感谢大家的关注与支持! 💜💜 网站实战项目 安卓/小程序实战项目 大数据实战项目 计算机毕业设计选题 💕💕文末获取源码联系计算机编程果茶熊
二、系统介绍
大数据框架:Hadoop+Spark(Hive需要定制修改) 开发语言:Java+Python(两个版本都支持) 数据库:MySQL 后端框架:SpringBoot(Spring+SpringMVC+Mybatis)+Django(两个版本都支持) 前端:Vue+Echarts+HTML+CSS+JavaScript+jQuery
音乐内容智能推荐与市场趋势分析系统是一个基于大数据技术的音乐行业分析平台,采用Hadoop+Spark分布式计算框架对海量音乐市场数据进行高效处理与深度挖掘。系统后端基于Django框架构建RESTful API接口,前端采用Vue+ElementUI实现交互界面,通过Echarts图表库将分析结果可视化呈现。核心功能涵盖音乐市场数据的采集管理、内容质量多维度评估、语言偏好统计分析、市场细分洞察、音乐类型流行趋势预测、时间季节性规律挖掘以及用户行为模式分析等模块。系统利用Spark SQL进行结构化数据查询,结合Pandas和NumPy进行数据清洗与统计计算,将处理后的数据存储至MySQL数据库中供业务调用。通过可视化大屏功能,平台能够实时展示市场动态、热门音乐类型分布、用户偏好变化等关键指标,为音乐内容创作者和平台运营者提供数据驱动的决策支持,帮助其精准把握市场风向、优化内容策略、提升用户体验和推荐效果。
三、视频解说
四、部分功能展示
五、部分代码展示
import json
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from django.http import JsonResponse
from django.views.decorators.http import require_http_methods
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    asc,
    avg,
    col,
    count,
    countDistinct,
    date_format,
    dense_rank,
    desc,
    hour,
    lit,
    month,
    percentile_approx,
    row_number,
    sum,
    when,
)
from pyspark.sql.window import Window
spark = SparkSession.builder.appName("MusicAnalysisSystem").config("spark.sql.warehouse.dir", "/user/hive/warehouse").config("spark.executor.memory", "4g").config("spark.driver.memory", "2g").enableHiveSupport().getOrCreate()
@require_http_methods(["POST"])
def analyze_music_content_quality(request):
    """POST view: score music content quality from user-behavior signals.

    Request body (JSON): ``start_date`` and ``end_date`` bound the
    behavior window. Computes per-track engagement/completion metrics,
    a weighted 0-100 quality score, a four-level quality distribution,
    and the top-20 tracks; persists the scored table back to MySQL and
    returns the aggregates as JSON.
    """
    payload = json.loads(request.body)
    start_date = payload.get('start_date')
    end_date = payload.get('end_date')
    # Load source tables over JDBC.
    # NOTE(review): credentials are hard-coded here — move to settings/env.
    music = (
        spark.read.format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306/music_db")
        .option("driver", "com.mysql.jdbc.Driver")
        .option("dbtable", "music_content")
        .option("user", "root")
        .option("password", "password")
        .load()
    )
    behavior = (
        spark.read.format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306/music_db")
        .option("driver", "com.mysql.jdbc.Driver")
        .option("dbtable", "user_behavior")
        .option("user", "root")
        .option("password", "password")
        .load()
    )
    # Restrict behavior events to the requested date window.
    window_events = behavior.filter(
        (col("behavior_time") >= start_date) & (col("behavior_time") <= end_date)
    )
    # Per-track interaction tallies; count(when(cond, 1)) counts only
    # rows matching the condition (when() yields null otherwise).
    engagement = window_events.groupBy("music_id").agg(
        count(when(col("behavior_type") == "play", 1)).alias("play_count"),
        count(when(col("behavior_type") == "like", 1)).alias("like_count"),
        count(when(col("behavior_type") == "share", 1)).alias("share_count"),
        count(when(col("behavior_type") == "comment", 1)).alias("comment_count"),
        avg(when(col("behavior_type") == "play", col("play_duration"))).alias("avg_play_duration"),
    )
    enriched = music.join(engagement, "music_id", "left")
    # Weighted quality score; the +1 in denominators avoids divide-by-zero
    # for tracks without plays.
    scored = (
        enriched
        .withColumn(
            "engagement_rate",
            (col("like_count") + col("share_count") + col("comment_count"))
            / (col("play_count") + lit(1)),
        )
        .withColumn(
            "completion_rate",
            col("avg_play_duration") / (col("music_duration") + lit(1)),
        )
        .withColumn(
            "quality_score",
            (
                col("engagement_rate") * 0.4
                + col("completion_rate") * 0.3
                + (col("like_count") / (col("play_count") + lit(1))) * 0.3
            ) * 100,
        )
    )
    # Bucket scores into four human-readable quality levels.
    leveled = scored.withColumn(
        "quality_level",
        when(col("quality_score") >= 80, "优秀")
        .when(col("quality_score") >= 60, "良好")
        .when(col("quality_score") >= 40, "中等")
        .otherwise("待提升"),
    )
    level_counts = leveled.groupBy("quality_level").agg(count("*").alias("count")).orderBy(desc("quality_level"))
    top20 = (
        scored.orderBy(desc("quality_score"))
        .limit(20)
        .select("music_id", "music_name", "artist", "quality_score", "engagement_rate", "completion_rate")
    )
    distribution_records = level_counts.toPandas().to_dict('records')
    top_records = top20.toPandas().to_dict('records')
    # Persist the full scored table for downstream dashboards.
    (
        scored.write.format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306/music_db")
        .option("driver", "com.mysql.jdbc.Driver")
        .option("dbtable", "music_quality_analysis")
        .option("user", "root")
        .option("password", "password")
        .mode("overwrite")
        .save()
    )
    return JsonResponse({"status": "success", "quality_distribution": distribution_records, "top_quality_music": top_records})
@require_http_methods(["POST"])
def analyze_music_type_trend(request):
    """POST view: analyze play trends by music type over a recent window.

    Request body (JSON): optional ``time_range`` (days, default 90).
    Produces daily top-5 types by plays, overall per-type stats, and
    seasonal top-3 types; persists the overall stats to MySQL and
    returns all three aggregates as JSON.
    """
    data = json.loads(request.body)
    time_range = data.get('time_range', 90)
    end_date = datetime.now()
    start_date = end_date - timedelta(days=time_range)
    # Load source tables over JDBC.
    # NOTE(review): credentials are hard-coded — move to settings/env.
    behavior_df = (
        spark.read.format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306/music_db")
        .option("driver", "com.mysql.jdbc.Driver")
        .option("dbtable", "user_behavior")
        .option("user", "root")
        .option("password", "password")
        .load()
    )
    music_df = (
        spark.read.format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306/music_db")
        .option("driver", "com.mysql.jdbc.Driver")
        .option("dbtable", "music_content")
        .option("user", "root")
        .option("password", "password")
        .load()
    )
    # Keep only "play" events inside the requested window.
    filtered_behavior = behavior_df.filter(
        (col("behavior_time") >= start_date.strftime('%Y-%m-%d'))
        & (col("behavior_time") <= end_date.strftime('%Y-%m-%d'))
        & (col("behavior_type") == "play")
    )
    behavior_with_music = filtered_behavior.join(
        music_df.select("music_id", "music_type"), "music_id", "inner"
    )
    behavior_with_date = behavior_with_music.withColumn(
        "behavior_date", date_format(col("behavior_time"), "yyyy-MM-dd")
    )
    # BUG FIX: the original used count(col("user_id").distinct()) — a
    # pyspark Column has no .distinct() method, so this raised at runtime.
    # countDistinct() is the correct per-group distinct-count aggregate.
    daily_type_stats = behavior_with_date.groupBy("behavior_date", "music_type").agg(
        count("*").alias("play_count"),
        countDistinct("user_id").alias("unique_users"),
    )
    # Rank types within each day by play volume; keep the daily top 5.
    window_spec = Window.partitionBy("behavior_date").orderBy(desc("play_count"))
    ranked_types = daily_type_stats.withColumn("rank", row_number().over(window_spec))
    top_types_daily = ranked_types.filter(col("rank") <= 5)
    overall_type_stats = behavior_with_music.groupBy("music_type").agg(
        count("*").alias("total_plays"),
        countDistinct("user_id").alias("total_users"),  # same .distinct() fix
        avg("play_duration").alias("avg_duration"),
    )
    # Bucket events into meteorological seasons by month.
    time_periods = behavior_with_music.withColumn(
        "time_period",
        when(month(col("behavior_time")).isin([3, 4, 5]), "春季")
        .when(month(col("behavior_time")).isin([6, 7, 8]), "夏季")
        .when(month(col("behavior_time")).isin([9, 10, 11]), "秋季")
        .otherwise("冬季"),
    )
    seasonal_type_stats = time_periods.groupBy("time_period", "music_type").agg(
        count("*").alias("play_count")
    )
    window_seasonal = Window.partitionBy("time_period").orderBy(desc("play_count"))
    seasonal_ranked = (
        seasonal_type_stats
        .withColumn("season_rank", row_number().over(window_seasonal))
        .filter(col("season_rank") <= 3)
    )
    # (Removed the unused growth_analysis aggregation from the original —
    # it was never collected, written, or returned.)
    trend_data = top_types_daily.orderBy("behavior_date", "rank").toPandas()
    overall_data = overall_type_stats.orderBy(desc("total_plays")).toPandas()
    seasonal_data = seasonal_ranked.orderBy("time_period", "season_rank").toPandas()
    trend_result = trend_data.to_dict('records')
    overall_result = overall_data.to_dict('records')
    seasonal_result = seasonal_data.to_dict('records')
    # Persist the per-type summary for downstream dashboards.
    (
        overall_type_stats.write.format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306/music_db")
        .option("driver", "com.mysql.jdbc.Driver")
        .option("dbtable", "music_type_trend")
        .option("user", "root")
        .option("password", "password")
        .mode("overwrite")
        .save()
    )
    return JsonResponse({"status": "success", "daily_trend": trend_result, "overall_stats": overall_result, "seasonal_stats": seasonal_result})
@require_http_methods(["POST"])
def analyze_user_behavior_pattern(request):
    """POST view: profile user activity patterns over a recent window.

    Request body (JSON): optional ``analysis_days`` (default 30).
    Computes hourly/time-slot activity, a weighted per-user activity
    score with level buckets, demographic breakdowns (age/gender/region),
    and an active-days retention distribution; persists the scored users
    to MySQL and returns all aggregates as JSON.
    """
    data = json.loads(request.body)
    analysis_days = data.get('analysis_days', 30)
    end_date = datetime.now()
    start_date = end_date - timedelta(days=analysis_days)
    # Load source tables over JDBC.
    # NOTE(review): credentials are hard-coded — move to settings/env.
    behavior_df = (
        spark.read.format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306/music_db")
        .option("driver", "com.mysql.jdbc.Driver")
        .option("dbtable", "user_behavior")
        .option("user", "root")
        .option("password", "password")
        .load()
    )
    user_df = (
        spark.read.format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306/music_db")
        .option("driver", "com.mysql.jdbc.Driver")
        .option("dbtable", "user_info")
        .option("user", "root")
        .option("password", "password")
        .load()
    )
    filtered_behavior = behavior_df.filter(
        (col("behavior_time") >= start_date.strftime('%Y-%m-%d'))
        & (col("behavior_time") <= end_date.strftime('%Y-%m-%d'))
    )
    # Derive the event hour and a coarse four-bucket time slot.
    behavior_with_hour = (
        filtered_behavior
        .withColumn("hour", hour(col("behavior_time")))
        .withColumn(
            "time_slot",
            when(col("hour").between(6, 11), "早晨(6-11)")
            .when(col("hour").between(12, 17), "下午(12-17)")
            .when(col("hour").between(18, 23), "晚上(18-23)")
            .otherwise("深夜(0-5)"),
        )
    )
    hourly_activity = behavior_with_hour.groupBy("hour", "behavior_type").agg(
        count("*").alias("action_count")
    )
    # BUG FIX: the original used count(col("user_id").distinct()) — a
    # pyspark Column has no .distinct() method, so this raised at runtime.
    # countDistinct() is the correct per-group distinct-count aggregate.
    timeslot_activity = behavior_with_hour.groupBy("time_slot", "behavior_type").agg(
        count("*").alias("action_count"),
        countDistinct("user_id").alias("active_users"),
    )
    # Per-user interaction tallies; count(when(cond, 1)) counts only
    # matching rows (when() yields null otherwise).
    user_activity_level = filtered_behavior.groupBy("user_id").agg(
        count("*").alias("total_actions"),
        count(when(col("behavior_type") == "play", 1)).alias("play_count"),
        count(when(col("behavior_type") == "like", 1)).alias("like_count"),
        count(when(col("behavior_type") == "share", 1)).alias("share_count"),
        count(when(col("behavior_type") == "comment", 1)).alias("comment_count"),
    )
    # Weighted activity score (shares weigh most), bucketed into levels.
    user_activity_scored = (
        user_activity_level
        .withColumn(
            "activity_score",
            col("play_count") * 1
            + col("like_count") * 3
            + col("share_count") * 5
            + col("comment_count") * 4,
        )
        .withColumn(
            "user_level",
            when(col("activity_score") >= 500, "高活跃用户")
            .when(col("activity_score") >= 200, "中活跃用户")
            .when(col("activity_score") >= 50, "低活跃用户")
            .otherwise("沉默用户"),
        )
    )
    user_level_dist = user_activity_scored.groupBy("user_level").agg(
        count("*").alias("user_count"),
        avg("activity_score").alias("avg_score"),
    )
    # Attach demographics for the breakdown views.
    user_with_demo = user_activity_scored.join(
        user_df.select("user_id", "age", "gender", "region"), "user_id", "left"
    )
    age_behavior = user_with_demo.groupBy("age", "user_level").agg(count("*").alias("count"))
    gender_behavior = user_with_demo.groupBy("gender", "user_level").agg(count("*").alias("count"))
    region_behavior = (
        user_with_demo.groupBy("region", "user_level")
        .agg(count("*").alias("count"))
        .orderBy(desc("count"))
        .limit(10)
    )
    # BUG FIX: same invalid Column.distinct() call — count distinct active
    # dates per user via countDistinct over the date-cast timestamp.
    user_retention = filtered_behavior.groupBy("user_id").agg(
        countDistinct(col("behavior_time").cast("date")).alias("active_days")
    )
    retention_rate = user_retention.withColumn(
        "retention_level",
        when(col("active_days") >= analysis_days * 0.7, "高留存")
        .when(col("active_days") >= analysis_days * 0.3, "中留存")
        .otherwise("低留存"),
    )
    retention_dist = retention_rate.groupBy("retention_level").agg(count("*").alias("user_count"))
    hourly_result = hourly_activity.orderBy("hour").toPandas().to_dict('records')
    timeslot_result = timeslot_activity.toPandas().to_dict('records')
    user_level_result = user_level_dist.orderBy(desc("avg_score")).toPandas().to_dict('records')
    age_result = age_behavior.toPandas().to_dict('records')
    gender_result = gender_behavior.toPandas().to_dict('records')
    region_result = region_behavior.toPandas().to_dict('records')
    retention_result = retention_dist.toPandas().to_dict('records')
    # Persist the scored users for downstream dashboards.
    (
        user_activity_scored.write.format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306/music_db")
        .option("driver", "com.mysql.jdbc.Driver")
        .option("dbtable", "user_behavior_analysis")
        .option("user", "root")
        .option("password", "password")
        .mode("overwrite")
        .save()
    )
    return JsonResponse({"status": "success", "hourly_activity": hourly_result, "timeslot_activity": timeslot_result, "user_level_distribution": user_level_result, "age_behavior": age_result, "gender_behavior": gender_result, "region_behavior": region_result, "retention_distribution": retention_result})
六、部分文档展示
七、END
💕💕文末获取源码联系计算机编程果茶熊