💖💖作者:计算机毕业设计杰瑞 💙💙个人简介:曾长期从事计算机专业培训教学,本人也热爱上课教学,语言擅长Java、微信小程序、Python、Golang、安卓Android等,开发项目包括大数据、深度学习、网站、小程序、安卓、算法。平常会做一些项目定制化开发、代码讲解、答辩教学、文档编写、也懂一些降重方面的技巧。平常喜欢分享一些自己开发中遇到的问题的解决办法,也喜欢交流技术,大家有技术代码这一块的问题可以问我! 💛💛想说的话:感谢大家的关注与支持! 💜💜 网站实战项目 安卓/小程序实战项目 大数据实战项目 深度学习实战项目 计算机毕业设计选题推荐
基于大数据的豆瓣高分电影数据可视化分析系统介绍
基于大数据的豆瓣高分电影数据可视化分析系统是一个集数据采集、存储、处理和可视化展示于一体的综合性分析平台。该系统采用Hadoop分布式文件系统作为底层存储架构,结合Spark大数据计算引擎实现海量电影数据的高效处理和分析。系统后端基于Django框架构建,前端采用Vue框架配合ElementUI组件库和Echarts可视化图表库,为用户提供直观友好的交互界面。系统核心功能涵盖电影特征可视化分析、电影类型内容分析、电影制作产业分析、影人影响力可视化分析以及观众平台覆盖率分析等多个维度。通过运用Spark SQL、Pandas、NumPy等数据处理技术,系统能够从豆瓣电影数据中挖掘出评分趋势、类型分布、制作成本与票房关系、导演演员影响力等有价值的信息,并以多样化的图表形式进行展示。系统还提供个人信息管理、密码修改等基础功能,确保用户数据安全。整个系统架构清晰、功能完善,为电影行业研究和爱好者提供了一个专业的数据分析工具。
基于大数据的豆瓣高分电影数据可视化分析系统演示视频
基于大数据的豆瓣高分电影数据可视化分析系统演示图片
基于大数据的豆瓣高分电影数据可视化分析系统代码展示
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, count, col, desc, asc, regexp_extract, split, explode, collect_list
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType
import pandas as pd
import json
def movie_feature_analysis(spark, movie_data_path):
    """Analyze movie rating features from a Douban CSV export.

    Args:
        spark: An active SparkSession.
        movie_data_path: Path to a headered CSV with at least the columns
            ``title``, ``rating``, ``release_date``, ``genre``.

    Returns:
        dict with three keys:
            - "yearly_trend": per-year average rating and movie count,
            - "top_movies": up to 50 movies rated >= 8.0, best first,
            - "rating_distribution": movie count per distinct rating value.
    """
    df = spark.read.option("header", "true").csv(movie_data_path)
    df = df.withColumn("rating", col("rating").cast(FloatType()))
    # Extract the first 4-digit run from the release date as the year.
    df = df.withColumn("year", regexp_extract(col("release_date"), r"(\d{4})", 1).cast(IntegerType()))
    # Drop rows where the cast or the year extraction failed: otherwise
    # avg("rating") can be null and round(None, 2) below raises TypeError,
    # and a null-year bucket would pollute the trend series.
    df = df.filter(col("rating").isNotNull() & col("year").isNotNull())
    rating_stats = df.groupBy("year").agg(avg("rating").alias("avg_rating"), count("*").alias("movie_count")).orderBy("year")
    high_rating_movies = df.filter(col("rating") >= 8.0).select("title", "rating", "year", "genre").orderBy(desc("rating"))
    rating_distribution = df.groupBy("rating").count().orderBy("rating")
    yearly_trend = rating_stats.collect()
    top_movies = high_rating_movies.limit(50).collect()
    rating_dist = rating_distribution.collect()
    result = {
        "yearly_trend": [{"year": row.year, "avg_rating": round(row.avg_rating, 2), "count": row.movie_count} for row in yearly_trend],
        "top_movies": [{"title": row.title, "rating": row.rating, "year": row.year, "genre": row.genre} for row in top_movies],
        "rating_distribution": [{"rating": row.rating, "count": row.count} for row in rating_dist]
    }
    return result
def movie_genre_analysis(spark, movie_data_path):
    """Analyze genre popularity and quality from a Douban CSV export.

    Args:
        spark: An active SparkSession.
        movie_data_path: Path to a headered CSV with at least the columns
            ``title``, ``rating``, ``genre`` (``genre`` is comma-separated).

    Returns:
        dict with three keys:
            - "genre_popularity": genres (>= 10 movies) by movie count,
            - "genre_quality": the same genres by average rating,
            - "top_movies_by_genre": per genre, up to 10 titles rated >= 8.0
              plus the total count of such titles.
    """
    df = spark.read.option("header", "true").csv(movie_data_path)
    df = df.withColumn("rating", col("rating").cast(FloatType()))
    # Rows with no rating or no genre cannot contribute to any statistic;
    # dropping them also prevents round(None, 2) from raising below.
    df = df.filter(col("rating").isNotNull() & col("genre").isNotNull())
    # One row per (movie, genre) pair: split the comma-separated genre list.
    genre_df = df.select("title", "rating", explode(split(col("genre"), ",")).alias("single_genre"))
    # Keep only the part before any "/" separator in a genre token.
    genre_df = genre_df.withColumn("single_genre", regexp_extract(col("single_genre"), r"([^/]+)", 1))
    # regexp_extract yields "" when nothing matches; discard those tokens so
    # an empty-string pseudo-genre never appears in the results.
    genre_df = genre_df.filter(col("single_genre") != "")
    genre_stats = genre_df.groupBy("single_genre").agg(
        count("*").alias("movie_count"),
        avg("rating").alias("avg_rating")
    ).filter(col("movie_count") >= 10)
    genre_popularity = genre_stats.orderBy(desc("movie_count"))
    genre_quality = genre_stats.orderBy(desc("avg_rating"))
    top_movies_by_genre = genre_df.filter(col("rating") >= 8.0).groupBy("single_genre").agg(
        collect_list("title").alias("top_movies"),
        count("*").alias("high_rating_count")
    ).orderBy(desc("high_rating_count"))
    popularity_result = genre_popularity.collect()
    quality_result = genre_quality.collect()
    top_genre_movies = top_movies_by_genre.collect()
    result = {
        "genre_popularity": [{"genre": row.single_genre, "count": row.movie_count, "avg_rating": round(row.avg_rating, 2)} for row in popularity_result],
        "genre_quality": [{"genre": row.single_genre, "avg_rating": round(row.avg_rating, 2), "count": row.movie_count} for row in quality_result],
        "top_movies_by_genre": [{"genre": row.single_genre, "movies": row.top_movies[:10], "high_rating_count": row.high_rating_count} for row in top_genre_movies]
    }
    return result
def celebrity_influence_analysis(spark, movie_data_path):
    """Analyze director and actor influence from a Douban CSV export.

    Args:
        spark: An active SparkSession.
        movie_data_path: Path to a headered CSV with at least the columns
            ``title``, ``rating``, ``director``, ``actors`` (the last two
            are comma-separated name lists).

    Returns:
        dict with six keys: top/prolific directors (>= 3 movies), top/prolific
        actors (>= 5 movies), and director/actor influence ranked by the
        number of movies rated >= 8.5.
    """
    df = spark.read.option("header", "true").csv(movie_data_path)
    df = df.withColumn("rating", col("rating").cast(FloatType()))
    # Unrated rows would propagate nulls into avg() and crash round() below.
    df = df.filter(col("rating").isNotNull())

    # One row per (movie, director) pair; drop null director lists first so
    # split()/explode() never see null, and discard empty extracted names
    # (regexp_extract returns "" on no match).
    director_df = df.filter(col("director").isNotNull()).select(
        "title", "rating", explode(split(col("director"), ",")).alias("single_director"))
    director_df = director_df.withColumn("single_director", regexp_extract(col("single_director"), r"([^/]+)", 1))
    director_df = director_df.filter(col("single_director") != "")
    director_stats = director_df.groupBy("single_director").agg(
        count("*").alias("movie_count"),
        avg("rating").alias("avg_rating")
    ).filter(col("movie_count") >= 3)

    # Same treatment for the actors column.
    actor_df = df.filter(col("actors").isNotNull()).select(
        "title", "rating", explode(split(col("actors"), ",")).alias("single_actor"))
    actor_df = actor_df.withColumn("single_actor", regexp_extract(col("single_actor"), r"([^/]+)", 1))
    actor_df = actor_df.filter(col("single_actor") != "")
    actor_stats = actor_df.groupBy("single_actor").agg(
        count("*").alias("movie_count"),
        avg("rating").alias("avg_rating")
    ).filter(col("movie_count") >= 5)

    top_directors = director_stats.orderBy(desc("avg_rating"), desc("movie_count")).limit(20)
    prolific_directors = director_stats.orderBy(desc("movie_count")).limit(20)
    top_actors = actor_stats.orderBy(desc("avg_rating"), desc("movie_count")).limit(30)
    prolific_actors = actor_stats.orderBy(desc("movie_count")).limit(30)
    # "Influence" = how many very-high-rated (>= 8.5) movies each person has.
    director_influence = director_df.filter(col("rating") >= 8.5).groupBy("single_director").count().orderBy(desc("count"))
    actor_influence = actor_df.filter(col("rating") >= 8.5).groupBy("single_actor").count().orderBy(desc("count"))

    top_dir_result = top_directors.collect()
    prolific_dir_result = prolific_directors.collect()
    top_act_result = top_actors.collect()
    prolific_act_result = prolific_actors.collect()
    dir_influence_result = director_influence.collect()
    act_influence_result = actor_influence.collect()
    result = {
        "top_directors": [{"name": row.single_director, "avg_rating": round(row.avg_rating, 2), "movie_count": row.movie_count} for row in top_dir_result],
        "prolific_directors": [{"name": row.single_director, "movie_count": row.movie_count, "avg_rating": round(row.avg_rating, 2)} for row in prolific_dir_result],
        "top_actors": [{"name": row.single_actor, "avg_rating": round(row.avg_rating, 2), "movie_count": row.movie_count} for row in top_act_result],
        "prolific_actors": [{"name": row.single_actor, "movie_count": row.movie_count, "avg_rating": round(row.avg_rating, 2)} for row in prolific_act_result],
        "director_influence": [{"name": row.single_director, "high_rating_movies": row.count} for row in dir_influence_result],
        "actor_influence": [{"name": row.single_actor, "high_rating_movies": row.count} for row in act_influence_result]
    }
    return result
# Build the shared Spark session. Adaptive query execution and automatic
# shuffle-partition coalescing are switched on so the small groupBy/agg
# jobs above do not pay for the default partition count.
_builder = SparkSession.builder.appName("DoubanMovieAnalysis")
_builder = _builder.config("spark.sql.adaptive.enabled", "true")
_builder = _builder.config("spark.sql.adaptive.coalescePartitions.enabled", "true")
spark = _builder.getOrCreate()
基于大数据的豆瓣高分电影数据可视化分析系统文档展示
💖💖作者:计算机毕业设计杰瑞 💙💙个人简介:曾长期从事计算机专业培训教学,本人也热爱上课教学,语言擅长Java、微信小程序、Python、Golang、安卓Android等,开发项目包括大数据、深度学习、网站、小程序、安卓、算法。平常会做一些项目定制化开发、代码讲解、答辩教学、文档编写、也懂一些降重方面的技巧。平常喜欢分享一些自己开发中遇到的问题的解决办法,也喜欢交流技术,大家有技术代码这一块的问题可以问我! 💛💛想说的话:感谢大家的关注与支持! 💜💜 网站实战项目 安卓/小程序实战项目 大数据实战项目 深度学习实战项目 计算机毕业设计选题推荐