1. About the Author
- 💖💖 Author: 计算机编程果茶熊
- 💙💙 About me: I spent years teaching computer science professionally and worked as a programming instructor. I genuinely enjoy teaching, and I'm proficient in Java, WeChat Mini Programs, Python, Golang, Android, and several other IT areas. I take on custom project development, code walkthroughs, thesis-defense coaching, and documentation writing, and I know some techniques for reducing plagiarism-check scores. I like sharing solutions to problems I run into during development and exchanging ideas about technology, so feel free to ask me anything about code!
- 💛💛 A quick note: thank you all for your follows and support!
- 💜💜
- Hands-on website projects
- Hands-on Android / Mini Program projects
- Hands-on big data projects
- 💕💕 To get the source code, contact 计算机编程果茶熊 at the end of this article
2. System Overview
- Big data stack: Hadoop + Spark (Hive is not used in this build; customization is supported)
- Languages: Python + Java (both versions are supported)
- Backend frameworks: Django and Spring Boot (Spring + Spring MVC + MyBatis) (both versions are supported)
- Frontend: Vue + ElementUI + ECharts + HTML + CSS + JavaScript + jQuery
- Key technologies: Hadoop, HDFS, Spark, Spark SQL, Pandas, NumPy
- Database: MySQL (a minimal sketch of how these pieces fit together follows this list)
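As a rough, hypothetical sketch of how the items above wire together, the snippet below shows Spark reading raw readings from HDFS, aggregating them with Spark SQL, and persisting the summary to MySQL over JDBC. The paths, table names, and credentials are placeholders, not the project's actual configuration:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("AirPollutionETL").getOrCreate()

# Raw pollution readings live on HDFS (path is illustrative)
raw = spark.read.parquet("hdfs:///air_quality/city_1001")

# Aggregate with Spark SQL, then persist the summary table to MySQL
raw.createOrReplaceTempView("air_quality")
daily = spark.sql(
    "SELECT date, AVG(pm25) AS avg_pm25 FROM air_quality GROUP BY date"
)
(daily.write.format("jdbc")
    .option("url", "jdbc:mysql://localhost:3306/air_db")  # placeholder DSN
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .option("dbtable", "daily_pm25")                      # placeholder table
    .option("user", "root")
    .option("password", "******")
    .mode("overwrite")
    .save())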
The Urban Air Pollution Data Analysis System Based on Big Data is a comprehensive analysis platform built on big data technology. Its core data processing framework is built with Hadoop and Spark, and dual-language support in Python and Java allows for flexible development. The backend comes in Django and Spring Boot (Spring + Spring MVC + MyBatis) variants, while the frontend integrates Vue, ElementUI, ECharts, and related technologies to deliver rich data visualizations. The functional modules include a system home page, a personal center, user management, and four core analysis modules: air quality assessment visualization, weather factor impact visualization, pollutant correlation visualization, and spatio-temporal distribution visualization.

Using Spark SQL, Pandas, and NumPy, the system efficiently processes and analyzes large-scale air pollution data, revealing pollutant distribution patterns, the mechanisms by which weather conditions exert their influence, and the relationships among pollutants from multiple perspectives, thereby providing data support for environmental monitoring, pollution control, and policy making. HDFS serves as the underlying storage layer and ensures high availability and scalability, while a MySQL database stores structured data and analysis results. The overall architecture satisfies big data processing requirements while keeping the system stable and responsive.
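To make the Django-to-ECharts hand-off concrete, here is a minimal sketch of a backend endpoint. The view name, URL, and module path are hypothetical examples rather than the system's actual API; analyze_air_quality refers to the function shown in section 5:

# views.py - hypothetical endpoint, shown only to illustrate the hand-off
from django.http import JsonResponse

from analysis.air_quality import analyze_air_quality  # assumed module path

def daily_aqi(request, city_id):
    # Date range comes from query parameters; the defaults are illustrative
    start = request.GET.get("start", "2023-01-01")
    end = request.GET.get("end", "2023-12-31")
    # analyze_air_quality() returns a Pandas DataFrame (see section 5)
    df = analyze_air_quality(city_id, start, end)
    # ECharts consumes plain arrays, so split the frame into axis and series
    payload = {
        "dates": df["date"].astype(str).tolist(),
        "avg_aqi": df["avg_aqi"].tolist(),
        "primary_pollutant": df["primary_pollutant"].tolist(),
    }
    return JsonResponse(payload)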
3. Urban Air Pollution Data Analysis System Based on Big Data - Video Walkthrough
Worried that your capstone project lacks technical depth? The Urban Air Pollution Data Analysis System integrates Hadoop + Spark to help you take your skills up a level.
4. Urban Air Pollution Data Analysis System Based on Big Data - Feature Showcase
5. Urban Air Pollution Data Analysis System Based on Big Data - Code Walkthrough
# Shared imports for the analysis functions below
from functools import reduce

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from statsmodels.tsa.stattools import grangercausalitytests
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import (
    array, array_max, avg, col, greatest, lit, month, struct, weekofyear,
    when, year,
)
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression


# Feature 1: air quality assessment and visualization
def analyze_air_quality(city_id, start_date, end_date):
    # Read the raw air quality data from HDFS
    spark = SparkSession.builder.appName("AirQualityAnalysis").getOrCreate()
    air_quality_data = spark.read.parquet(f"hdfs:///air_quality/city_{city_id}")
    # Restrict to the requested date range
    filtered_data = air_quality_data.filter(
        (col("date") >= start_date) & (col("date") <= end_date)
    )
    # Compute a simplified AQI as the worst normalized pollutant sub-index
    aqi_data = filtered_data.withColumn(
        "aqi",
        greatest(
            col("pm25") / 35 * 100,
            col("pm10") / 150 * 100,
            col("o3") / 160 * 100,
            col("no2") / 200 * 100,
            col("so2") / 150 * 100,
            col("co") / 4 * 100
        )
    )
    # Grade air quality according to the national standard bands
    aqi_levels = aqi_data.withColumn(
        "quality_level",
        when(col("aqi") <= 50, "Excellent")
        .when(col("aqi") <= 100, "Good")
        .when(col("aqi") <= 150, "Lightly polluted")
        .when(col("aqi") <= 200, "Moderately polluted")
        .when(col("aqi") <= 300, "Heavily polluted")
        .otherwise("Severely polluted")
    )
    # Contribution of each pollutant's sub-index to the overall AQI
    contribution_data = aqi_levels.withColumn(
        "pm25_contribution", col("pm25") / 35 * 100 / col("aqi")
    ).withColumn(
        "pm10_contribution", col("pm10") / 150 * 100 / col("aqi")
    ).withColumn(
        "o3_contribution", col("o3") / 160 * 100 / col("aqi")
    ).withColumn(
        "no2_contribution", col("no2") / 200 * 100 / col("aqi")
    ).withColumn(
        "so2_contribution", col("so2") / 150 * 100 / col("aqi")
    ).withColumn(
        "co_contribution", col("co") / 4 * 100 / col("aqi")
    )
    # Aggregate by day: mean AQI and mean contribution per pollutant
    daily_aqi = contribution_data.groupBy("date").agg(
        avg("aqi").alias("avg_aqi"),
        avg("pm25_contribution").alias("pm25_contrib"),
        avg("pm10_contribution").alias("pm10_contrib"),
        avg("o3_contribution").alias("o3_contrib"),
        avg("no2_contribution").alias("no2_contrib"),
        avg("so2_contribution").alias("so2_contrib"),
        avg("co_contribution").alias("co_contrib")
    )
    # Identify each day's primary pollutant: array_max compares structs by
    # their first field (the contribution), so the fields are aliased
    # explicitly and the winning label is read back via its field name
    daily_aqi = daily_aqi.withColumn(
        "primary_pollutant",
        array_max(array(
            struct(col("pm25_contrib").alias("contrib"), lit("PM2.5").alias("pollutant")),
            struct(col("pm10_contrib").alias("contrib"), lit("PM10").alias("pollutant")),
            struct(col("o3_contrib").alias("contrib"), lit("O3").alias("pollutant")),
            struct(col("no2_contrib").alias("contrib"), lit("NO2").alias("pollutant")),
            struct(col("so2_contrib").alias("contrib"), lit("SO2").alias("pollutant")),
            struct(col("co_contrib").alias("contrib"), lit("CO").alias("pollutant"))
        ))["pollutant"]
    )
    # Convert the result to a Pandas DataFrame for visualization
    pandas_df = daily_aqi.toPandas()
    return pandas_df
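To give a feel for the output, here is a quick, hypothetical invocation; the city id and date range are placeholders:

# Hypothetical invocation; the id and dates are placeholders
daily_df = analyze_air_quality(city_id=1001, start_date="2023-01-01", end_date="2023-03-31")
print(daily_df[["date", "avg_aqi", "primary_pollutant"]].head())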
# Feature 2: weather factor impact analysis and visualization
def analyze_weather_impact(city_id, pollutant_type, time_period):
    # Initialize the Spark session
    spark = SparkSession.builder.appName("WeatherImpactAnalysis").getOrCreate()
    # Load the air quality data and the weather data
    air_data = spark.read.parquet(f"hdfs:///air_quality/city_{city_id}")
    weather_data = spark.read.parquet(f"hdfs:///weather/city_{city_id}")
    # Join the two datasets on timestamp and station; joining on column
    # names (rather than qualified expressions) avoids duplicate columns
    joined_data = air_data.join(
        weather_data,
        ["timestamp", "station_id"],
        "inner"
    )
    # Pick the grouping column according to the requested time period
    if time_period == "seasonal":
        # Derive a season column from the month
        joined_data = joined_data.withColumn(
            "season",
            when((month(col("date")) >= 3) & (month(col("date")) <= 5), "Spring")
            .when((month(col("date")) >= 6) & (month(col("date")) <= 8), "Summer")
            .when((month(col("date")) >= 9) & (month(col("date")) <= 11), "Autumn")
            .otherwise("Winter")
        )
        group_col = "season"
    else:
        # Monthly analysis
        joined_data = joined_data.withColumn("month", month(col("date")))
        group_col = "month"
    # Pearson correlation between each weather factor and the pollutant
    weather_factors = ["temperature", "humidity", "wind_speed", "pressure", "rainfall"]
    correlations = {}
    for factor in weather_factors:
        correlation = joined_data.stat.corr(pollutant_type, factor)
        correlations[factor] = correlation
    # Average pollutant concentration under each group of weather conditions
    weather_impact = joined_data.groupBy(group_col).agg(
        avg(col(pollutant_type)).alias(f"avg_{pollutant_type}"),
        avg("temperature").alias("avg_temp"),
        avg("humidity").alias("avg_humidity"),
        avg("wind_speed").alias("avg_wind"),
        avg("pressure").alias("avg_pressure"),
        avg("rainfall").alias("avg_rainfall")
    )
    # Fit a regression model to estimate how strongly each factor drives the pollutant
    # Assemble the weather factors into a feature vector
    assembler = VectorAssembler(
        inputCols=weather_factors,
        outputCol="features"
    )
    feature_data = assembler.transform(joined_data)
    # Train the linear regression model
    lr = LinearRegression(featuresCol="features", labelCol=pollutant_type)
    lr_model = lr.fit(feature_data)
    # Use the absolute coefficients as per-factor importance weights
    coefficients = lr_model.coefficients.toArray()
    factor_importance = {weather_factors[i]: abs(coefficients[i]) for i in range(len(weather_factors))}
    # Treat wind direction separately: the mean concentration per direction
    # shows how transport paths affect the pollutant
    wind_direction_impact = joined_data.groupBy("wind_direction").agg(
        avg(col(pollutant_type)).alias(f"avg_{pollutant_type}")
    ).orderBy(f"avg_{pollutant_type}", ascending=False)
    # Package everything as a plain dict for the frontend charts
    result = {
        "correlations": correlations,
        "grouped_data": weather_impact.toPandas().to_dict("records"),
        "factor_importance": factor_importance,
        "wind_direction_impact": wind_direction_impact.toPandas().to_dict("records"),
        "model_r2": lr_model.summary.r2
    }
    return result
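For example, a seasonal analysis of PM2.5 for one city could be requested as follows (all argument values are hypothetical):

# Hypothetical call: seasonal PM2.5 analysis for a single city
impact = analyze_weather_impact(city_id=1001, pollutant_type="pm25", time_period="seasonal")
print(impact["correlations"])        # Pearson correlation per weather factor
print(impact["factor_importance"])   # absolute regression weights
print(impact["model_r2"])            # goodness of fit of the linear model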
# Feature 3: pollutant cross-correlation analysis and visualization
def analyze_pollutant_correlation(city_ids, start_date, end_date, time_granularity="daily"):
    # Initialize the Spark session
    spark = SparkSession.builder.appName("PollutantCorrelationAnalysis").getOrCreate()
    # Load the pollutant data for every requested city
    dfs = []
    for city_id in city_ids:
        city_data = spark.read.parquet(f"hdfs:///air_quality/city_{city_id}")
        city_data = city_data.withColumn("city_id", lit(city_id))
        dfs.append(city_data)
    # Union the per-city DataFrames into one
    all_cities_data = reduce(DataFrame.unionAll, dfs)
    # Restrict to the requested date range
    filtered_data = all_cities_data.filter(
        (col("date") >= start_date) & (col("date") <= end_date)
    )
    # Pick the aggregation key for the requested time granularity
    if time_granularity == "hourly":
        time_col = "timestamp"
    elif time_granularity == "daily":
        time_col = "date"
    elif time_granularity == "weekly":
        filtered_data = filtered_data.withColumn("week", weekofyear(col("date")))
        filtered_data = filtered_data.withColumn("year", year(col("date")))
        time_col = array("year", "week").alias("year_week")
    elif time_granularity == "monthly":
        filtered_data = filtered_data.withColumn("month", month(col("date")))
        filtered_data = filtered_data.withColumn("year", year(col("date")))
        time_col = array("year", "month").alias("year_month")
    else:
        raise ValueError(f"unsupported time granularity: {time_granularity}")
    # Mean concentration of each pollutant per time bucket and city
    grouped_data = filtered_data.groupBy(time_col, "city_id").agg(
        avg("pm25").alias("avg_pm25"),
        avg("pm10").alias("avg_pm10"),
        avg("o3").alias("avg_o3"),
        avg("no2").alias("avg_no2"),
        avg("so2").alias("avg_so2"),
        avg("co").alias("avg_co")
    )
    # Pairwise Pearson correlation matrix between the pollutants
    pollutants = ["avg_pm25", "avg_pm10", "avg_o3", "avg_no2", "avg_so2", "avg_co"]
    correlation_matrix = {}
    for p1 in pollutants:
        correlation_matrix[p1] = {}
        for p2 in pollutants:
            corr = grouped_data.stat.corr(p1, p2)
            correlation_matrix[p1][p2] = corr
    # Pull the aggregates into Pandas for the finer-grained analyses below
    pandas_df = grouped_data.toPandas()
    # Partial correlations: remove the influence of the other pollutants
    partial_corr_matrix = pd.DataFrame(index=pollutants, columns=pollutants)
    for i, p1 in enumerate(pollutants):
        for j, p2 in enumerate(pollutants):
            if i == j:
                partial_corr_matrix.loc[p1, p2] = 1.0
            else:
                control_vars = [p for p in pollutants if p != p1 and p != p2]
                partial_corr = partial_correlation(pandas_df[p1], pandas_df[p2], pandas_df[control_vars])
                partial_corr_matrix.loc[p1, p2] = partial_corr
    # Granger causality tests between each ordered pair of pollutants
    granger_results = {}
    for p1 in pollutants:
        granger_results[p1] = {}
        for p2 in pollutants:
            if p1 != p2:
                # Run the test separately for each city
                city_granger = {}
                for city_id in city_ids:
                    city_data = pandas_df[pandas_df['city_id'] == city_id]
                    if len(city_data) > 30:  # require enough data points
                        try:
                            result = grangercausalitytests(
                                city_data[[p1, p2]],
                                maxlag=7,  # test lags of up to 7 periods
                                verbose=False
                            )
                            # Keep the most significant p-value across the lags
                            min_p_value = min([result[i][0]['ssr_ftest'][1] for i in range(1, 8)])
                            city_granger[city_id] = min_p_value
                        except Exception:
                            city_granger[city_id] = None
                granger_results[p1][p2] = city_granger
    # Principal component analysis to surface co-occurring pollutant patterns
    pca = PCA(n_components=3)
    pca_result = pca.fit_transform(pandas_df[pollutants])
    # Explained variance ratio and loadings of each component
    explained_variance = pca.explained_variance_ratio_
    component_matrix = pca.components_
    # Assemble the result dict
    result = {
        "correlation_matrix": correlation_matrix,
        "partial_correlation_matrix": partial_corr_matrix.to_dict(),
        "granger_causality": granger_results,
        "pca": {
            "explained_variance": explained_variance.tolist(),
            "component_matrix": component_matrix.tolist(),
            "pollutants": pollutants
        }
    }
    return result
# Helper: partial correlation via the residual method
def partial_correlation(x, y, control):
    # Normalize the control variables to a DataFrame
    if isinstance(control, pd.Series):
        control = pd.DataFrame(control)
    # Regress x and y on the controls (with an intercept) and correlate the
    # residuals: the standard residual-based definition of partial correlation
    Z = np.column_stack([np.ones(len(control)), control.values])
    beta_x, _, _, _ = np.linalg.lstsq(Z, x.values, rcond=None)
    beta_y, _, _, _ = np.linalg.lstsq(Z, y.values, rcond=None)
    resid_x = x.values - Z @ beta_x
    resid_y = y.values - Z @ beta_y
    return float(np.corrcoef(resid_x, resid_y)[0, 1])
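The helper follows the textbook residual approach: regress both variables on the controls and correlate what is left over. To close out the section, here is a hypothetical multi-city call to the correlation analysis; the city ids and dates are placeholders:

# Hypothetical multi-city call at daily granularity
corr = analyze_pollutant_correlation(
    city_ids=[1001, 1002, 1003],
    start_date="2023-01-01",
    end_date="2023-12-31",
    time_granularity="daily",
)
print(corr["correlation_matrix"]["avg_pm25"]["avg_pm10"])  # pairwise Pearson r
print(corr["pca"]["explained_variance"])                   # variance per component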
6. Urban Air Pollution Data Analysis System Based on Big Data - Documentation Showcase
7. END
💕💕 To get the source code, contact 计算机编程果茶熊 at the end of this article