Kuaishou User Activity Analysis System Based on Big Data | Three months until your thesis defense? Without Hadoop + Spark big-data skills, a Kuaishou user activity analysis system could delay your graduation


💖💖 Author: 计算机毕业设计江挽 💙💙 About me: I taught computer science professionally for years and still enjoy teaching. My languages include Java, WeChat Mini Program, Python, Golang, and Android, and my projects span big data, deep learning, websites, mini programs, Android apps, and algorithms. I also take on custom project development, code walkthroughs, defense coaching, and documentation writing, and I know a few techniques for reducing similarity-check scores. I like sharing solutions to problems I hit during development and talking shop, so feel free to ask me about anything code-related! 💛💛 One more thing: thank you all for your attention and support! 💜💜 Website projects · Android/Mini Program projects · Big data projects · Deep learning projects

Introduction to the Kuaishou User Activity Analysis System Based on Big Data

The Kuaishou user activity analysis system is a user-behavior analytics platform built on big-data technology. It uses the Hadoop + Spark distributed computing stack to mine and analyze Kuaishou user data in depth, with a Django backend and a Vue frontend providing a friendly interactive interface and MySQL storing the structured data. Core features include user activity information management, user behavior-pattern analysis, and analysis along geographic and school dimensions.

The system runs large-scale queries and statistical analysis with Spark SQL, handles data preprocessing and scientific computing with Pandas and NumPy, and renders the results with the Echarts charting library. The overall architecture supports real-time processing and analysis of large volumes of user data and examines Kuaishou user activity patterns from multiple dimensions, giving platform operations data-backed decision support. The interface is clean and intuitive, and the user-profiling feature characterizes user attributes precisely, helping to understand the behavioral preferences and activity traits of different user groups.
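At the heart of the analysis is a weighted activity score: each action type contributes according to the effort it represents, with content creation weighted most heavily. Below is a minimal plain-Python sketch of the same weighting scheme used in the Spark code later in this post; the record format here is a simplified assumption for illustration.

# Weights mirror the when/otherwise chain in user_activity_analysis below.
ACTIVITY_WEIGHTS = {"like": 1.0, "comment": 2.0, "share": 3.0, "create": 5.0}
DEFAULT_WEIGHT = 0.5  # any unlisted activity type

def weighted_activity_score(records):
    # records: iterable of (activity_type, activity_count) pairs
    return sum(ACTIVITY_WEIGHTS.get(t, DEFAULT_WEIGHT) * n for t, n in records)

# Example: 10 likes + 2 comments + 1 share -> 10*1.0 + 2*2.0 + 1*3.0 = 17.0
print(weighted_activity_score([("like", 10), ("comment", 2), ("share", 1)]))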

Demo Video of the Kuaishou User Activity Analysis System Based on Big Data

Demo video

Screenshots of the Kuaishou User Activity Analysis System Based on Big Data


Code Showcase of the Kuaishou User Activity Analysis System Based on Big Data

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, desc, when, countDistinct, sum as spark_sum
import pandas as pd
import numpy as np
from django.http import JsonResponse
from django.views.decorators.http import require_http_methods
from django.views.decorators.csrf import csrf_exempt
import json
from datetime import datetime, timedelta

spark = (SparkSession.builder
         .appName("KuaishouUserActivityAnalysis")
         .config("spark.sql.adaptive.enabled", "true")
         .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
         .getOrCreate())

@csrf_exempt
@require_http_methods(["GET", "POST"])
def user_activity_analysis(request):
    if request.method == "GET":
        user_id = request.GET.get('user_id')
        days = int(request.GET.get('days', 30))
        
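        # NOTE: interpolating request values straight into SQL is injection-prone;
        # validate user_id or bind parameters in production.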
        user_activity_df = spark.sql(f"""
            SELECT user_id, activity_date, activity_type, activity_count, duration_minutes
            FROM user_activity_logs 
            WHERE user_id = '{user_id}' 
            AND activity_date >= date_sub(current_date(), {days})
        """)
        
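        # Daily roll-up: total actions, average session minutes, and activity-type variety per day.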
        daily_activity = user_activity_df.groupBy("activity_date").agg(
            spark_sum("activity_count").alias("total_activities"),
            avg("duration_minutes").alias("avg_duration"),
            count("activity_type").alias("activity_types")
        ).orderBy(desc("activity_date"))
        
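        # How this user's activity splits across action types.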
        activity_pattern = user_activity_df.groupBy("activity_type").agg(
            spark_sum("activity_count").alias("type_count"),
            avg("duration_minutes").alias("avg_type_duration")
        ).orderBy(desc("type_count"))
        
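        # The user's five busiest hours of the day.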
        peak_hours = spark.sql(f"""
            SELECT hour(activity_timestamp) as hour, count(*) as activity_freq
            FROM user_activity_logs 
            WHERE user_id = '{user_id}' 
            AND activity_date >= date_sub(current_date(), {days})
            GROUP BY hour(activity_timestamp)
            ORDER BY activity_freq DESC
            LIMIT 5
        """)
        
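        # Weight actions by effort: creating content counts five times a like; unlisted types count half.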
        user_score = user_activity_df.withColumn("weighted_score", 
            when(col("activity_type") == "like", col("activity_count") * 1.0)
            .when(col("activity_type") == "comment", col("activity_count") * 2.0)
            .when(col("activity_type") == "share", col("activity_count") * 3.0)
            .when(col("activity_type") == "create", col("activity_count") * 5.0)
            .otherwise(col("activity_count") * 0.5)
        ).agg(spark_sum("weighted_score").alias("total_score")).collect()[0]["total_score"]
        
        daily_data = [{"date": row["activity_date"].strftime("%Y-%m-%d"), 
                      "activities": row["total_activities"], 
                      "duration": round(row["avg_duration"], 2)} 
                     for row in daily_activity.collect()]
        pattern_data = [{"type": row["activity_type"], 
                        "count": row["type_count"], 
                        "avg_duration": round(row["avg_type_duration"], 2)} 
                       for row in activity_pattern.collect()]
        peak_data = [{"hour": row["hour"], "frequency": row["activity_freq"]} 
                    for row in peak_hours.collect()]
        
        return JsonResponse({
            "user_activity_score": round(user_score, 2) if user_score else 0,
            "daily_activities": daily_data,
            "activity_patterns": pattern_data,
            "peak_hours": peak_data,
            "analysis_period": days
        })

@csrf_exempt
@require_http_methods(["GET"])
def behavior_pattern_analysis(request):
    region = request.GET.get('region', 'all')
    age_group = request.GET.get('age_group', 'all')
    time_period = int(request.GET.get('time_period', 7))
    
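    # Join user attributes onto the activity log for the selected time window.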
    base_query = f"""
        SELECT u.user_id, u.region, u.age_group, u.school_type,
               a.activity_type, a.activity_count, a.duration_minutes,
               a.activity_date, hour(a.activity_timestamp) as activity_hour
        FROM users u JOIN user_activity_logs a ON u.user_id = a.user_id
        WHERE a.activity_date >= date_sub(current_date(), {time_period})
    """
    
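    # NOTE: appending raw query parameters is injection-prone; whitelist region/age_group values in production.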
    if region != 'all':
        base_query += f" AND u.region = '{region}'"
    if age_group != 'all':
        base_query += f" AND u.age_group = '{age_group}'"
    
    behavior_df = spark.sql(base_query)
    
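    # Average activity, session length, and user count per region and activity type.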
    regional_patterns = behavior_df.groupBy("region", "activity_type").agg(
        avg("activity_count").alias("avg_activity"),
        avg("duration_minutes").alias("avg_duration"),
        count("user_id").alias("user_count")
    ).orderBy("region", desc("avg_activity"))
    
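    # Hourly activity curve across the filtered population.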
    time_patterns = behavior_df.groupBy("activity_hour").agg(
        spark_sum("activity_count").alias("total_activities"),
        count("user_id").distinct().alias("active_users")
    ).orderBy("activity_hour")
    
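    # How each age group distributes its activity across types.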
    age_behavior = behavior_df.groupBy("age_group", "activity_type").agg(
        avg("activity_count").alias("avg_count"),
        spark_sum("duration_minutes").alias("total_duration")
    ).orderBy("age_group", desc("avg_count"))
    
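    # Compare activity levels across school types (rows with a school_type only).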
    school_influence = behavior_df.filter(col("school_type").isNotNull()).groupBy("school_type").agg(
        avg("activity_count").alias("avg_activity"),
        avg("duration_minutes").alias("avg_duration"),
        count("user_id").distinct().alias("student_count")
    ).orderBy(desc("avg_activity"))
    
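    # Blend activity volume (30%) and watch time (70%) into a per-region engagement score.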
    engagement_score = behavior_df.withColumn("engagement_score",
        (col("activity_count") * 0.3 + col("duration_minutes") * 0.7)
    ).groupBy("region").agg(avg("engagement_score").alias("avg_engagement")).orderBy(desc("avg_engagement"))
    
    regional_data = [{"region": row["region"], "activity_type": row["activity_type"], 
                     "avg_activity": round(row["avg_activity"], 2), 
                     "avg_duration": round(row["avg_duration"], 2)} 
                    for row in regional_patterns.collect()]
    
    time_data = [{"hour": row["activity_hour"], 
                 "activities": row["total_activities"], 
                 "users": row["active_users"]} 
                for row in time_patterns.collect()]
    
    age_data = [{"age_group": row["age_group"], "activity_type": row["activity_type"], 
                "avg_count": round(row["avg_count"], 2), 
                "total_duration": row["total_duration"]} 
               for row in age_behavior.collect()]
    
    return JsonResponse({
        "regional_patterns": regional_data,
        "hourly_patterns": time_data,
        "age_behaviors": age_data,
        "school_influence": [{"school_type": row["school_type"], 
                             "avg_activity": round(row["avg_activity"], 2)} 
                            for row in school_influence.collect()],
        "engagement_rankings": [{"region": row["region"], 
                                "engagement": round(row["avg_engagement"], 2)} 
                               for row in engagement_score.collect()]
    })

@csrf_exempt
@require_http_methods(["POST"])
def user_profile_analysis(request):
    data = json.loads(request.body)
    user_ids = data.get('user_ids', [])
    analysis_type = data.get('analysis_type', 'comprehensive')
    
    if not user_ids:
        return JsonResponse({"error": "用户ID列表不能为空"}, status=400)
    
    user_list = "','".join(user_ids)
    
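    # NOTE: building the IN list by string concatenation is fine for a demo,
    # but untrusted IDs should be validated before interpolation.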
    profile_df = spark.sql(f"""
        SELECT u.user_id, u.region, u.age_group, u.gender, u.school_type,
               u.registration_date, u.follower_count, u.following_count,
               a.activity_type, sum(a.activity_count) as total_count,
               avg(a.duration_minutes) as avg_duration,
               count(distinct a.activity_date) as active_days
        FROM users u 
        LEFT JOIN user_activity_logs a ON u.user_id = a.user_id
        WHERE u.user_id IN ('{user_list}')
        AND a.activity_date >= date_sub(current_date(), 90)
        GROUP BY u.user_id, u.region, u.age_group, u.gender, u.school_type,
                 u.registration_date, u.follower_count, u.following_count, a.activity_type
    """)
    
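    # Wide per-user preference matrix: one column per activity type, zero-filled.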
    activity_preference = profile_df.groupBy("user_id").pivot("activity_type").sum("total_count").fillna(0)
    
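    # Bucket each row by activity volume and by average session depth.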
    user_segments = profile_df.withColumn("activity_level",
        when(col("total_count") >= 100, "high")
        .when(col("total_count") >= 50, "medium")
        .otherwise("low")
    ).withColumn("engagement_type",
        when(col("avg_duration") >= 30, "deep_engagement")
        .when(col("avg_duration") >= 15, "moderate_engagement")
        .otherwise("light_engagement")
    )
    
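    # Follower-weighted influence score (followers 60%, following 40%), scaled by 1/1000.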
    social_influence = profile_df.withColumn("influence_score",
        (col("follower_count") * 0.6 + col("following_count") * 0.4) / 1000
    ).select("user_id", "influence_score").orderBy(desc("influence_score"))
    
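    # Regional distribution of the selected users, with average activity and follower counts.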
    geographic_clustering = profile_df.groupBy("region").agg(
        count("user_id").alias("user_count"),
        avg("total_count").alias("avg_activity"),
        avg("follower_count").alias("avg_followers")
    ).orderBy(desc("user_count"))
    
    # Register the per-user summary as a temp view, then compute retention
    # as active days divided by days since registration.
    profile_df.select("user_id", "registration_date", "active_days").distinct() \
        .createOrReplaceTempView("temp_retention")
    retention_analysis = spark.sql("""
        SELECT user_id,
               datediff(current_date(), registration_date) as days_since_reg,
               active_days,
               (active_days * 1.0 / datediff(current_date(), registration_date)) as retention_rate
        FROM temp_retention
        WHERE datediff(current_date(), registration_date) > 0
    """)
    
    profile_summary = []
    for user_id in user_ids:
        user_data = profile_df.filter(col("user_id") == user_id).collect()
        if user_data:
            activities = {row["activity_type"]: {"count": row["total_count"], 
                         "avg_duration": round(row["avg_duration"], 2)} 
                         for row in user_data if row["activity_type"]}
            user_info = user_data[0]
            profile_summary.append({
                "user_id": user_id,
                "region": user_info["region"],
                "age_group": user_info["age_group"],
                "activities": activities,
                "active_days": user_info["active_days"],
                "social_metrics": {
                    "followers": user_info["follower_count"],
                    "following": user_info["following_count"]
                }
            })
    
    return JsonResponse({
        "user_profiles": profile_summary,
        "geographic_distribution": [{"region": row["region"], 
                                   "user_count": row["user_count"], 
                                   "avg_activity": round(row["avg_activity"], 2)} 
                                  for row in geographic_clustering.collect()],
        "influence_rankings": [{"user_id": row["user_id"], 
                               "influence_score": round(row["influence_score"], 2)} 
                              for row in social_influence.limit(10).collect()],
        "retention_metrics": [{"user_id": row["user_id"], 
                              "retention_rate": round(row["retention_rate"], 3)} 
                             for row in retention_analysis.collect()]
    })
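
To show how these endpoints fit together, here is a hedged sketch of the Django routing; the module path analysis.views and the URL paths are illustrative assumptions rather than the project's actual configuration.

# urls.py -- hypothetical wiring for the three analysis views above
from django.urls import path
from analysis import views  # assumed location of the view functions

urlpatterns = [
    path('api/user-activity/', views.user_activity_analysis, name='user-activity'),
    path('api/behavior-patterns/', views.behavior_pattern_analysis, name='behavior-patterns'),
    path('api/user-profiles/', views.user_profile_analysis, name='user-profiles'),
]

With this routing, a client would GET /api/user-activity/?user_id=u1001&days=30 for a single user's summary, or POST a JSON body such as {"user_ids": ["u1001", "u1002"]} to /api/user-profiles/ for batch profiling.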

Documentation of the Kuaishou User Activity Analysis System Based on Big Data

