Computer Science Capstone (Big Data) Project: A Family Energy Consumption Data Analysis System with the Python + Django Tech Stack



1. Development Tools

  • Big data frameworks: Hadoop + Spark (Hive is not used in this version; customization is supported)
  • Development languages: Python + Java (both versions are supported)
  • Backend frameworks: Django and Spring Boot (Spring + Spring MVC + MyBatis) (both versions are supported)
  • Frontend: Vue + ElementUI + ECharts + HTML + CSS + JavaScript + jQuery
  • Key technologies: Hadoop, HDFS, Spark, Spark SQL, Pandas, NumPy
  • Database: MySQL

2. System Overview

This big-data-based system for analyzing and visualizing family energy consumption is an integrated analytics platform built on Hadoop and Spark. The backend is developed with Python and Django, and the frontend interface is built with Vue, ElementUI, and ECharts. Household energy consumption data is stored in HDFS, queried and processed efficiently with Spark SQL, and analyzed in depth with scientific computing libraries such as Pandas and NumPy. The core features cover user management, family attribute analysis, temperature influence analysis, time series analysis, peak-hour analysis, and clustering analysis, so that household consumption patterns can be examined from multiple angles. Through ECharts components, the analysis results are rendered as line charts, bar charts, heat maps, and other visualizations, giving users a clear picture of their household energy usage and providing data support and a decision-making basis for energy conservation and emission reduction.
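
The analysis views in the source code below query registered tables such as energy_consumption, weather_data, hourly_energy_data, and daily_energy_summary through Spark SQL, so the HDFS data must first be exposed to Spark as queryable views. The following is a minimal sketch of how that registration might look, assuming the cleaned data is stored in HDFS as CSV files with headers; the paths, file format, and schema inference are illustrative assumptions rather than the project's actual loading code.

# Minimal sketch: register the HDFS datasets as temporary views so that the
# analysis code can query them with spark.sql(). Paths and CSV format are assumptions.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("FamilyEnergyAnalysis").getOrCreate()

tables = {
    "energy_consumption": "hdfs:///energy/energy_consumption",      # hypothetical path
    "weather_data": "hdfs:///energy/weather_data",                  # hypothetical path
    "hourly_energy_data": "hdfs:///energy/hourly_energy_data",      # hypothetical path
    "daily_energy_summary": "hdfs:///energy/daily_energy_summary",  # hypothetical path
}

for name, path in tables.items():
    df = spark.read.option("header", "true").option("inferSchema", "true").csv(path)
    df.createOrReplaceTempView(name)  # makes the dataset available to Spark SQL by name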

3. System Feature Demo

(Demo video: Family Energy Consumption Data Analysis System, Python + Django tech stack)

4. System Interface

(System interface screenshots)

5. System Source Code


from pyspark.sql import SparkSession
from pyspark.sql.functions import *  # brings in Spark's avg, max, min, sum, when, col, etc.
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import ClusteringEvaluator
from django.http import JsonResponse
from django.views import View
import pandas as pd
import numpy as np
import json
from datetime import datetime, timedelta

# Shared SparkSession; adaptive query execution is enabled to better handle skewed aggregations
spark = SparkSession.builder.appName("FamilyEnergyAnalysis").config("spark.sql.adaptive.enabled", "true").getOrCreate()

class TemperatureInfluenceAnalysis(View):
    """Analyzes how outdoor temperature relates to a household's energy consumption."""
    def post(self, request):
        data = json.loads(request.body)
        family_id = data.get('family_id')
        start_date = data.get('start_date')
        end_date = data.get('end_date')
        # String-formatted SQL keeps the demo short; parameters should be validated/sanitized in production
        energy_df = spark.sql(f"SELECT * FROM energy_consumption WHERE family_id = {family_id} AND date BETWEEN '{start_date}' AND '{end_date}'")
        weather_df = spark.sql(f"SELECT * FROM weather_data WHERE date BETWEEN '{start_date}' AND '{end_date}'")
        combined_df = energy_df.join(weather_df, "date")
        correlation_matrix = combined_df.select("temperature", "energy_usage", "humidity", "heating_usage", "cooling_usage").toPandas().corr()
        temp_ranges = combined_df.withColumn("temp_range", 
            when(col("temperature") < 10, "低温(<10°C)")
            .when((col("temperature") >= 10) & (col("temperature") < 20), "中温(10-20°C)")
            .when((col("temperature") >= 20) & (col("temperature") < 30), "适温(20-30°C)")
            .otherwise("高温(≥30°C)"))
        range_analysis = temp_ranges.groupBy("temp_range").agg(
            avg("energy_usage").alias("avg_energy"),
            max("energy_usage").alias("max_energy"),
            min("energy_usage").alias("min_energy"),
            count("*").alias("data_count")
        ).collect()
        seasonal_analysis = combined_df.withColumn("month", month("date")).withColumn("season",
            when(col("month").isin([12, 1, 2]), "冬季")
            .when(col("month").isin([3, 4, 5]), "春季")
            .when(col("month").isin([6, 7, 8]), "夏季")
            .otherwise("秋季"))
        seasonal_stats = seasonal_analysis.groupBy("season").agg(
            avg("energy_usage").alias("seasonal_avg"),
            avg("temperature").alias("seasonal_temp"),
            stddev("energy_usage").alias("energy_stddev")
        ).collect()
        temperature_trend = combined_df.select("date", "temperature", "energy_usage").orderBy("date").toPandas()
        result = {
            'correlation_data': correlation_matrix.to_dict(),
            'temp_range_analysis': [row.asDict() for row in range_analysis],
            'seasonal_analysis': [row.asDict() for row in seasonal_stats],
            'trend_data': temperature_trend.to_dict('records')
        }
        return JsonResponse(result)

class PeakHourAnalysis(View):
    """Identifies peak-usage hours and compares weekday/weekend and monthly patterns."""
    def post(self, request):
        data = json.loads(request.body)
        family_id = data.get('family_id')
        analysis_period = data.get('period', 30)  # look-back window in days, default 30
        end_date = datetime.now()
        start_date = end_date - timedelta(days=analysis_period)
        hourly_df = spark.sql(f"SELECT * FROM hourly_energy_data WHERE family_id = {family_id} AND timestamp BETWEEN '{start_date}' AND '{end_date}'")
        hourly_stats = hourly_df.withColumn("hour", hour("timestamp")).groupBy("hour").agg(
            avg("energy_usage").alias("avg_usage"),
            max("energy_usage").alias("max_usage"),
            min("energy_usage").alias("min_usage"),
            count("*").alias("record_count")
        ).orderBy("hour")
        daily_patterns = hourly_df.withColumn("day_of_week", dayofweek("timestamp")).withColumn("hour", hour("timestamp")).groupBy("day_of_week", "hour").agg(
            avg("energy_usage").alias("daily_hourly_avg")
        ).orderBy("day_of_week", "hour")
        # A "peak" hour is one whose average usage exceeds 1.5x the overall hourly mean
        peak_threshold = hourly_stats.agg(avg("avg_usage")).collect()[0][0] * 1.5
        peak_hours = hourly_stats.filter(col("avg_usage") > peak_threshold).select("hour", "avg_usage").collect()
        # Spark's dayofweek() returns 1 for Sunday and 7 for Saturday, so [1, 7] marks weekends
        weekend_weekday = hourly_df.withColumn("is_weekend", 
            when(dayofweek("timestamp").isin([1, 7]), "周末").otherwise("工作日")
        ).withColumn("hour", hour("timestamp")).groupBy("is_weekend", "hour").agg(
            avg("energy_usage").alias("period_avg")
        ).orderBy("is_weekend", "hour")
        monthly_peak_trend = hourly_df.withColumn("month", month("timestamp")).withColumn("hour", hour("timestamp")).groupBy("month", "hour").agg(
            avg("energy_usage").alias("monthly_hourly_avg")
        ).orderBy("month", "hour")
        peak_duration_analysis = hourly_df.withColumn("hour", hour("timestamp")).withColumn("is_peak",
            when(col("energy_usage") > peak_threshold, 1).otherwise(0)
        ).groupBy(window("timestamp", "1 day")).agg(
            sum("is_peak").alias("peak_hours_count"),
            avg("energy_usage").alias("daily_avg")
        )
        result = {
            'hourly_statistics': [row.asDict() for row in hourly_stats.collect()],
            'daily_patterns': [row.asDict() for row in daily_patterns.collect()],
            'peak_hours': [row.asDict() for row in peak_hours],
            'weekend_weekday_comparison': [row.asDict() for row in weekend_weekday.collect()],
            'monthly_trend': [row.asDict() for row in monthly_peak_trend.collect()],
            'peak_duration_stats': [row.asDict() for row in peak_duration_analysis.collect()],
            'peak_threshold': peak_threshold
        }
        return JsonResponse(result)

class FamilyClusteringAnalysis(View):
    """Groups families into consumption profiles using KMeans over per-family usage statistics."""
    def post(self, request):
        data = json.loads(request.body)
        cluster_features = data.get('features', ['total_usage', 'peak_usage', 'usage_variance'])  # accepted but the model currently uses a fixed feature set
        num_clusters = data.get('num_clusters', 5)
        # Build one feature row per family from the daily summary table
        family_features_df = spark.sql("SELECT family_id, AVG(daily_usage) as avg_daily, MAX(daily_usage) as max_daily, MIN(daily_usage) as min_daily, STDDEV(daily_usage) as usage_stddev, AVG(peak_hour_usage) as avg_peak, COUNT(*) as data_points FROM daily_energy_summary GROUP BY family_id")
        feature_columns = ['avg_daily', 'max_daily', 'usage_stddev', 'avg_peak']
        assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
        feature_vector_df = assembler.transform(family_features_df)
        kmeans = KMeans(k=num_clusters, seed=42, featuresCol="features", predictionCol="cluster")
        kmeans_model = kmeans.fit(feature_vector_df)
        clustered_df = kmeans_model.transform(feature_vector_df)
        cluster_summary = clustered_df.groupBy("cluster").agg(
            count("family_id").alias("family_count"),
            avg("avg_daily").alias("cluster_avg_daily"),
            avg("max_daily").alias("cluster_max_daily"),
            avg("usage_stddev").alias("cluster_stddev"),
            avg("avg_peak").alias("cluster_peak_avg")
        ).collect()
        cluster_centers = kmeans_model.clusterCenters()
        family_cluster_mapping = clustered_df.select("family_id", "cluster", "avg_daily", "max_daily", "usage_stddev", "avg_peak").collect()
        # Cluster quality measured with Spark's built-in silhouette coefficient
        evaluator = ClusteringEvaluator(featuresCol="features", predictionCol="cluster", metricName="silhouette")
        silhouette_score = evaluator.evaluate(clustered_df)
        usage_pattern_analysis = clustered_df.join(
            spark.sql("SELECT family_id, AVG(CASE WHEN hour BETWEEN 6 AND 18 THEN energy_usage ELSE 0 END) as day_usage, AVG(CASE WHEN hour BETWEEN 19 AND 23 THEN energy_usage ELSE 0 END) as evening_usage, AVG(CASE WHEN hour BETWEEN 0 AND 5 THEN energy_usage ELSE 0 END) as night_usage FROM hourly_energy_data GROUP BY family_id"),
            "family_id"
        ).groupBy("cluster").agg(
            avg("day_usage").alias("avg_day_usage"),
            avg("evening_usage").alias("avg_evening_usage"),
            avg("night_usage").alias("avg_night_usage")
        ).collect()
        result = {
            'cluster_summary': [row.asDict() for row in cluster_summary],
            'cluster_centers': [center.tolist() for center in cluster_centers],
            'family_clusters': [row.asDict() for row in family_cluster_mapping],
            'silhouette_score': silhouette_score,
            'usage_patterns': [row.asDict() for row in usage_pattern_analysis],
            # Static display weights; KMeans itself does not produce feature importances
            'feature_importance': dict(zip(feature_columns, [0.3, 0.25, 0.25, 0.2]))
        }
        return JsonResponse(result)
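
To make these views reachable from the Vue + ECharts frontend, they need to be wired into Django's URL configuration. The snippet below is a minimal sketch of such a urls.py; the route paths and the module name energy_analysis.views are assumptions for illustration, not the project's actual routing.

# Hypothetical urls.py sketch; the app module and route paths are illustrative
from django.urls import path
from energy_analysis.views import (
    TemperatureInfluenceAnalysis,
    PeakHourAnalysis,
    FamilyClusteringAnalysis,
)

urlpatterns = [
    path("api/analysis/temperature/", TemperatureInfluenceAnalysis.as_view()),
    path("api/analysis/peak-hours/", PeakHourAnalysis.as_view()),
    path("api/analysis/clustering/", FamilyClusteringAnalysis.as_view()),
]

Each endpoint accepts a JSON POST body (for example {"family_id": 1, "period": 30}) and returns the JsonResponse payloads shown above, which the ECharts components render as charts. Depending on how the frontend is authenticated, these POST endpoints may also need CSRF handling (a CSRF token, or csrf_exempt for pure API use).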

6. System Documentation

(System documentation screenshot)

End