Preface
- 💖💖Author: 计算机程序员小杨
- 💙💙About me: I work in the computer field and am experienced in Java, WeChat Mini Programs, Python, Golang, Android, and several other IT directions. I take on custom project development, code walkthroughs, thesis-defense coaching, and documentation writing, and I also know some techniques for reducing text-similarity scores. I love technology, enjoy digging into new tools and frameworks, and like solving real problems with code. Feel free to ask me anything about code or technology!
- 💛💛A quick word: thank you all for your follows and support!
- 💕💕To get the source code, contact 计算机程序员小杨 at the end of this post.
- 💜💜
- Web application projects
- Android / Mini Program projects
- Big data projects
- Deep learning projects
- Computer science graduation project topics
- 💜💜
I. Development Tools
- Big data framework: Hadoop + Spark (Hive is not used this time; customization is supported)
- Development languages: Python + Java (both versions are supported)
- Backend frameworks: Django + Spring Boot (Spring + Spring MVC + MyBatis) (both versions are supported)
- Frontend: Vue + ElementUI + ECharts + HTML + CSS + JavaScript + jQuery
- Key technologies: Hadoop, HDFS, Spark, Spark SQL, Pandas, NumPy (see the sketch just after this list for how they fit together)
- Database: MySQL
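To show how these pieces connect, here is a minimal sketch of loading monitoring records from HDFS with Spark and registering them for Spark SQL queries. The HDFS path and CSV layout are assumptions for illustration, not the project's actual files; only the `water_monitoring` view name matches the code in Section V:

```python
from pyspark.sql import SparkSession

# Sketch only: wire Spark to HDFS. The path and schema below are assumed.
spark = (SparkSession.builder
         .appName("WaterPollutionAnalysisDemo")
         .config("spark.sql.adaptive.enabled", "true")
         .getOrCreate())

# Read raw monitoring records from HDFS (hypothetical path and CSV layout)
df = (spark.read.option("header", "true").option("inferSchema", "true")
      .csv("hdfs://namenode:9000/water/monitoring/*.csv"))

# Register a temp view so Spark SQL queries like those in Section V can run
df.createOrReplaceTempView("water_monitoring")
```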
II. System Overview
The big-data-based China water pollution monitoring data visualization and analysis system is an environmental monitoring platform built on modern big data technology, with the Hadoop distributed storage framework and the Spark processing engine at the core of its architecture. The service layer is built in Python on the Django backend framework, while the frontend uses the Vue framework together with the ElementUI component library and the ECharts charting library for rich data visualization. The system comprises eight core functional modules, including user management, water pollution monitoring data management, pollution cause exploration, comprehensive water quality evaluation, in-depth analysis of core pollutants, and spatiotemporal water quality distribution analysis. Spark SQL handles large-scale data queries and Pandas and NumPy carry out the scientific computation, so the system can efficiently process massive volumes of water quality monitoring data and provide environmental protection agencies and research institutions with intuitive analysis results and decision support. HDFS ensures reliable, scalable storage of the raw data, while MySQL stores the structured business data, forming a complete big data processing and analysis pipeline.
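To make the HDFS/MySQL storage split concrete, here is a hedged sketch of persisting a Spark aggregate over the `water_monitoring` view to MySQL over JDBC for the business layer. The connection URL, credentials, and table name are placeholders, and the MySQL JDBC driver must be on the Spark classpath:

```python
# Sketch only: push a daily summary computed by Spark into MySQL via JDBC.
daily_avg = spark.sql(
    "SELECT monitor_date, AVG(pollution_index) AS avg_index "
    "FROM water_monitoring GROUP BY monitor_date")

(daily_avg.write.format("jdbc")
    .option("url", "jdbc:mysql://localhost:3306/water_db")  # placeholder URL
    .option("dbtable", "daily_pollution_summary")           # placeholder table
    .option("user", "root").option("password", "******")
    .mode("overwrite")
    .save())
```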
III. System Feature Demo
Environmental protection + big data, two hot topics in one: a big-data-based water pollution monitoring data visualization and analysis system leading a new trend in graduation projects
IV. System Interface Screenshots
V. Source Code Showcase
```python
from pyspark.sql import SparkSession
# Note: max, min, first and last are Spark aggregate functions here;
# max and min shadow the Python builtins of the same name.
from pyspark.sql.functions import col, avg, max, min, count, when, desc, first, last
from pyspark.sql.window import Window
import pandas as pd
import numpy as np
from django.http import JsonResponse
from django.views import View
import json

# Shared SparkSession with adaptive query execution enabled
spark = (SparkSession.builder
         .appName("WaterPollutionAnalysis")
         .config("spark.sql.adaptive.enabled", "true")
         .getOrCreate())
```
```python
class PollutionCauseAnalysis(View):
    def post(self, request):
        data = json.loads(request.body)
        start_date = data.get('start_date')
        end_date = data.get('end_date')
        region = data.get('region')
        df = spark.sql(f"SELECT * FROM water_monitoring WHERE monitor_date BETWEEN "
                       f"'{start_date}' AND '{end_date}' AND region = '{region}'")
        # Average key indicators per pollution source, worst COD first
        pollution_sources = df.groupBy('pollution_source').agg(
            avg('cod').alias('avg_cod'), avg('bod').alias('avg_bod'),
            avg('ammonia_nitrogen').alias('avg_ammonia'),
            count('*').alias('sample_count')).orderBy(desc('avg_cod'))
        # Characteristic indicators of the three main source types
        industrial_impact = df.filter(col('pollution_source') == 'industrial').agg(
            avg('heavy_metals').alias('avg_heavy_metals'),
            max('ph_value').alias('max_ph')).collect()[0]
        agricultural_impact = df.filter(col('pollution_source') == 'agricultural').agg(
            avg('phosphorus').alias('avg_phosphorus'),
            avg('nitrogen').alias('avg_nitrogen')).collect()[0]
        domestic_impact = df.filter(col('pollution_source') == 'domestic').agg(
            avg('coliform_bacteria').alias('avg_bacteria'),
            avg('suspended_solids').alias('avg_suspended')).collect()[0]
        # Pairwise correlation of the four core indicators, computed in Pandas
        correlation_matrix = df.select('cod', 'bod', 'ammonia_nitrogen',
                                       'phosphorus').toPandas().corr()
        pollution_trend = df.groupBy('monitor_date').agg(
            avg('pollution_index').alias('daily_pollution')).orderBy('monitor_date')
        # Map the month column onto seasons, then aggregate per season
        seasonal_analysis = df.withColumn('season',
            when(col('month').isin(3, 4, 5), 'spring')
            .when(col('month').isin(6, 7, 8), 'summer')
            .when(col('month').isin(9, 10, 11), 'autumn')
            .otherwise('winter')).groupBy('season').agg(
            avg('cod').alias('seasonal_cod'), avg('pollution_index').alias('seasonal_index'))
        risk_assessment = df.withColumn('risk_level',
            when(col('pollution_index') > 80, 'high')
            .when(col('pollution_index') > 50, 'medium')
            .otherwise('low')).groupBy('risk_level').count()
        # Each source's share of the total COD load
        total_cod = df.agg({'cod': 'sum'}).collect()[0][0]
        source_contribution = df.groupBy('pollution_source').agg(
            (avg('cod') * count('*')).alias('total_contribution')).withColumn(
            'contribution_rate', col('total_contribution') / total_cod * 100)
        temporal_pattern = df.groupBy('hour').agg(
            avg('pollution_index').alias('hourly_avg')).orderBy('hour')
        result_data = {
            'pollution_sources': [r.asDict() for r in pollution_sources.collect()],
            'industrial_impact': industrial_impact.asDict(),
            'agricultural_impact': agricultural_impact.asDict(),
            'domestic_impact': domestic_impact.asDict(),
            'correlation_matrix': correlation_matrix.to_dict(),
            'pollution_trend': [r.asDict() for r in pollution_trend.collect()],
            'seasonal_analysis': [r.asDict() for r in seasonal_analysis.collect()],
            'risk_assessment': [r.asDict() for r in risk_assessment.collect()],
            'source_contribution': [r.asDict() for r in source_contribution.collect()],
            'temporal_pattern': [r.asDict() for r in temporal_pattern.collect()]}
        return JsonResponse(result_data)
```
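One caveat on the query above: interpolating request parameters into SQL with f-strings is open to SQL injection. A safer sketch of the same filter built from DataFrame expressions (assuming the `water_monitoring` view is registered as shown earlier):

```python
# Sketch: equivalent date/region filter as column expressions, so user
# input from the request body is never spliced into SQL text.
safe_df = (spark.table("water_monitoring")
           .filter(col("monitor_date").between(start_date, end_date))
           .filter(col("region") == region))
```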
```python
class WaterQualityComprehensiveEvaluation(View):
    def post(self, request):
        data = json.loads(request.body)
        monitoring_points = data.get('monitoring_points', [])
        evaluation_period = data.get('evaluation_period')
        # Build the IN (...) list outside the f-string: backslash escapes are
        # not allowed inside f-string expressions before Python 3.12
        points_sql = ', '.join(f"'{p}'" for p in monitoring_points)
        df = spark.sql(f"SELECT * FROM water_monitoring WHERE monitoring_point IN "
                       f"({points_sql}) AND evaluation_period = '{evaluation_period}'")
        # Thresholds match Class III limits in China's surface water standard
        quality_standards = {'cod': 20, 'bod': 4, 'ammonia_nitrogen': 1.0,
                             'phosphorus': 0.2, 'ph_min': 6, 'ph_max': 9}
        standard_compliance = df.select('monitoring_point',
            when(col('cod') <= quality_standards['cod'], 1).otherwise(0).alias('cod_compliant'),
            when(col('bod') <= quality_standards['bod'], 1).otherwise(0).alias('bod_compliant'),
            when(col('ammonia_nitrogen') <= quality_standards['ammonia_nitrogen'], 1)
                .otherwise(0).alias('ammonia_compliant'),
            when((col('ph_value') >= quality_standards['ph_min']) &
                 (col('ph_value') <= quality_standards['ph_max']), 1)
                .otherwise(0).alias('ph_compliant'))
        compliance_rate = standard_compliance.groupBy('monitoring_point').agg(
            avg('cod_compliant').alias('cod_rate'), avg('bod_compliant').alias('bod_rate'),
            avg('ammonia_compliant').alias('ammonia_rate'), avg('ph_compliant').alias('ph_rate'))
        # Deduct up to 20 points per indicator from a 100-point baseline
        overall_quality_score = df.withColumn('quality_score',
            100 - (col('cod') / quality_standards['cod'] * 20
                   + col('bod') / quality_standards['bod'] * 20
                   + col('ammonia_nitrogen') / quality_standards['ammonia_nitrogen'] * 20
                   + col('phosphorus') / quality_standards['phosphorus'] * 20)
            ).groupBy('monitoring_point').agg(avg('quality_score').alias('avg_quality_score'))
        water_quality_grade = overall_quality_score.withColumn('grade',
            when(col('avg_quality_score') >= 90, 'I').when(col('avg_quality_score') >= 80, 'II')
            .when(col('avg_quality_score') >= 70, 'III').when(col('avg_quality_score') >= 60, 'IV')
            .otherwise('V'))
        # Daily deviation from each point's own mean, via a per-point window
        point_window = Window.partitionBy('monitoring_point')
        temporal_variation = df.groupBy('monitoring_point', 'monitor_date').agg(
            avg('pollution_index').alias('daily_index')).withColumn(
            'variation', col('daily_index') - avg('daily_index').over(point_window))
        pollution_severity = df.withColumn('severity_level',
            when(col('pollution_index') > 100, 'severe')
            .when(col('pollution_index') > 70, 'moderate')
            .when(col('pollution_index') > 40, 'light')
            .otherwise('clean')).groupBy('monitoring_point', 'severity_level').count()
        # Improvement = earliest minus latest pollution index at each point
        # (a positive value means the index dropped over the period)
        span_window = (Window.partitionBy('monitoring_point').orderBy('monitor_date')
                       .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing))
        improvement_trend = df.withColumn(
            'first_idx', first('pollution_index').over(span_window)).withColumn(
            'last_idx', last('pollution_index').over(span_window)).groupBy(
            'monitoring_point').agg(
            (max('first_idx') - max('last_idx')).alias('improvement_value')).withColumn(
            'trend_direction', when(col('improvement_value') > 5, 'improving')
            .when(col('improvement_value') < -5, 'deteriorating').otherwise('stable'))
        # Weighted ecological impact from three indicators
        ecological_impact_score = df.withColumn('eco_impact',
            col('cod') * 0.3 + col('ammonia_nitrogen') * 0.4 + col('phosphorus') * 0.3
            ).groupBy('monitoring_point').agg(avg('eco_impact').alias('avg_eco_impact'))
        comprehensive_ranking = water_quality_grade.join(
            ecological_impact_score, 'monitoring_point').orderBy(desc('avg_quality_score'))
        result_data = {
            'compliance_rate': [r.asDict() for r in compliance_rate.collect()],
            'quality_grades': [r.asDict() for r in water_quality_grade.collect()],
            'temporal_variation': [r.asDict() for r in temporal_variation.collect()],
            'pollution_severity': [r.asDict() for r in pollution_severity.collect()],
            'improvement_trend': [r.asDict() for r in improvement_trend.collect()],
            'ecological_impact': [r.asDict() for r in ecological_impact_score.collect()],
            'comprehensive_ranking': [r.asDict() for r in comprehensive_ranking.collect()]}
        return JsonResponse(result_data)
```
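For reference, a client call to this endpoint might look like the following; the URL path and sample values are illustrative only (routing is sketched after the last class below), and the view just needs `monitoring_points` and `evaluation_period` in the JSON body:

```python
import requests  # illustrative client; the endpoint path is an assumption

resp = requests.post(
    "http://localhost:8000/api/water-quality/evaluation/",
    json={"monitoring_points": ["point_001", "point_002"],
          "evaluation_period": "2024-Q1"})
print(resp.json()["quality_grades"])
```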
```python
class CorePollutantDepthAnalysis(View):
    def post(self, request):
        data = json.loads(request.body)
        pollutant_type = data.get('pollutant_type')
        analysis_region = data.get('analysis_region')
        time_range = data.get('time_range')
        # Pull the range bounds out first: reusing the same quote character
        # inside an f-string expression is a syntax error before Python 3.12
        start, end = time_range['start'], time_range['end']
        df = spark.sql(f"SELECT * FROM water_monitoring WHERE region = '{analysis_region}' "
                       f"AND monitor_date BETWEEN '{start}' AND '{end}'")
        pollutant_concentration_stats = df.agg(
            avg(pollutant_type).alias('avg_concentration'),
            max(pollutant_type).alias('max_concentration'),
            min(pollutant_type).alias('min_concentration'),
            count(pollutant_type).alias('sample_size'))
        # Bucket concentrations into coarse ranges for the histogram view
        concentration_distribution = df.withColumn('concentration_range',
            when(col(pollutant_type) <= 10, '0-10').when(col(pollutant_type) <= 30, '10-30')
            .when(col(pollutant_type) <= 50, '30-50').otherwise('50+')
            ).groupBy('concentration_range').count()
        # Points whose readings exceed the fixed reference threshold of 20
        exceeding_standard_analysis = df.withColumn('exceeding_times',
            when(col(pollutant_type) > 20, col(pollutant_type) / 20).otherwise(0)
            ).filter(col('exceeding_times') > 1).groupBy('monitoring_point').agg(
            avg('exceeding_times').alias('avg_exceeding_times'),
            count('*').alias('exceeding_count'))
        pollutant_source_tracking = df.groupBy('pollution_source').agg(
            avg(pollutant_type).alias('source_avg_concentration'),
            count('*').alias('source_sample_count')).orderBy(desc('source_avg_concentration'))
        seasonal_pollutant_pattern = df.withColumn('season',
            when(col('month').isin(3, 4, 5), 'spring').when(col('month').isin(6, 7, 8), 'summer')
            .when(col('month').isin(9, 10, 11), 'autumn').otherwise('winter')
            ).groupBy('season').agg(avg(pollutant_type).alias('seasonal_avg'),
                                    max(pollutant_type).alias('seasonal_max'))
        daily_variation_pattern = df.groupBy('hour').agg(
            avg(pollutant_type).alias('hourly_avg_concentration')).orderBy('hour')
        # Correlation of the pollutant with environmental factors, via Pandas
        pollutant_correlation_with_factors = df.select(
            pollutant_type, 'temperature', 'rainfall', 'wind_speed',
            'industrial_activity').toPandas().corr()[pollutant_type].to_dict()
        # Hotspots: points repeatedly measuring 1.5x above the regional mean
        mean_concentration = df.agg({pollutant_type: 'avg'}).collect()[0][0]
        pollution_hotspot_identification = df.filter(
            col(pollutant_type) > mean_concentration * 1.5).groupBy('monitoring_point').agg(
            avg(pollutant_type).alias('hotspot_avg_concentration'),
            count('*').alias('hotspot_frequency'))
        # Removal rate per treatment method from before/after readings
        pollutant_removal_efficiency = df.filter(col('treatment_status') == 'treated').withColumn(
            'removal_rate', (col(f'{pollutant_type}_before') - col(f'{pollutant_type}_after'))
            / col(f'{pollutant_type}_before') * 100).groupBy('treatment_method').agg(
            avg('removal_rate').alias('avg_removal_rate'))
        long_term_trend_analysis = df.groupBy('year', 'month').agg(
            avg(pollutant_type).alias('monthly_avg')).orderBy('year', 'month')
        health_risk_assessment = df.withColumn('health_risk_index',
            when(col(pollutant_type) > 50, 'high').when(col(pollutant_type) > 20, 'medium')
            .otherwise('low')).groupBy('health_risk_index').count()
        result_data = {
            'concentration_stats': pollutant_concentration_stats.collect()[0].asDict(),
            'concentration_distribution': [r.asDict() for r in concentration_distribution.collect()],
            'exceeding_analysis': [r.asDict() for r in exceeding_standard_analysis.collect()],
            'source_tracking': [r.asDict() for r in pollutant_source_tracking.collect()],
            'seasonal_pattern': [r.asDict() for r in seasonal_pollutant_pattern.collect()],
            'daily_variation': [r.asDict() for r in daily_variation_pattern.collect()],
            'correlation_factors': pollutant_correlation_with_factors,
            'pollution_hotspots': [r.asDict() for r in pollution_hotspot_identification.collect()],
            'removal_efficiency': [r.asDict() for r in pollutant_removal_efficiency.collect()],
            'trend_analysis': [r.asDict() for r in long_term_trend_analysis.collect()],
            'health_risk': [r.asDict() for r in health_risk_assessment.collect()]}
        return JsonResponse(result_data)
```
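Finally, a minimal sketch of how these class-based views could be registered in Django's URL configuration; the path prefixes are assumptions and should be adapted to the project's actual `urls.py`:

```python
# urls.py (sketch): wire the three analysis endpoints to their views.
from django.urls import path
from .views import (PollutionCauseAnalysis, WaterQualityComprehensiveEvaluation,
                    CorePollutantDepthAnalysis)

urlpatterns = [
    path('api/pollution-cause/', PollutionCauseAnalysis.as_view()),
    path('api/water-quality/evaluation/', WaterQualityComprehensiveEvaluation.as_view()),
    path('api/pollutant/depth-analysis/', CorePollutantDepthAnalysis.as_view()),
]
```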