Preface
- 💖💖Author: 计算机程序员小杨
- 💙💙About me: I work in the computer field and am proficient in Java, WeChat Mini Programs, Python, Golang, Android, and several other IT areas. I take on custom project development, code walkthroughs, thesis-defense coaching, and documentation writing, and I also know a few techniques for lowering text-similarity (plagiarism-check) scores. I love technology, enjoy digging into new tools and frameworks, and like solving real problems with code. If you have any questions about technology or code, feel free to ask me!
- 💛💛A word of thanks: thank you all for your attention and support!
- 💕💕To get the source code, contact 计算机程序员小杨 at the end of this post
- 💜💜
- Website projects
- Android / Mini Program projects
- Big data projects
- Deep learning projects
- Graduation project topic selection
- 💜💜
一. Development Tools
- Big data framework: Hadoop + Spark (Hive is not used this time; customization is supported)
- Development languages: Python + Java (both versions are supported)
- Backend frameworks: Django + Spring Boot (Spring + Spring MVC + MyBatis) (both versions are supported)
- Frontend: Vue + ElementUI + Echarts + HTML + CSS + JavaScript + jQuery
- Key technologies: Hadoop, HDFS, Spark, Spark SQL, Pandas, NumPy (see the sketch after this list)
- Database: MySQL
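For a quick sense of how these pieces fit together, here is a minimal, self-contained sketch (not the project's actual code): Spark reads CSV files from HDFS, Spark SQL aggregates them, and the small result is handed to Pandas/NumPy for final statistics. The HDFS path and column names below are placeholders.

```python
from pyspark.sql import SparkSession
import numpy as np

spark = SparkSession.builder.appName("StackSanityCheck").getOrCreate()

# Read raw CSV files from HDFS (path and schema are illustrative placeholders)
df = spark.read.option("header", "true").csv("hdfs://localhost:9000/demo/*.csv")
df.createOrReplaceTempView("deals")

# Aggregate with Spark SQL, then pull the small result into Pandas/NumPy
pdf = spark.sql(
    "SELECT funding_stage, COUNT(*) AS n FROM deals GROUP BY funding_stage"
).toPandas()
print("median deals per stage:", np.median(pdf["n"]))
spark.stop()
```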
二. System Overview
The big-data-based online education investment and financing data visualization analysis system is an integrated platform covering data collection, processing, analysis, and visual presentation. It uses the Hadoop + Spark big data stack as its core processing engine, efficiently handling large volumes of investment and financing records from the online education industry, with Spark SQL performing the complex queries and statistical analysis. The backend exposes RESTful APIs built on Spring Boot, and the frontend uses the Vue + ElementUI + Echarts stack to deliver a responsive data visualization interface. The main functional modules are user management, online education investment data management, funding stage distribution analysis, investor behavior analysis, overall industry trend analysis, and popular sub-sector analysis. Raw data is stored on the HDFS distributed file system; Spark's in-memory computing analyzes the investment and financing data in real time, Pandas and NumPy handle data cleaning and statistical calculation, and the results are finally rendered as intuitive Echarts charts, providing data support and decision-making references for education practitioners, investment institutions, and policymakers.
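Since each analysis function in the source section below returns a ready-made JSON string, the web layer only needs to pass it through. Here is a minimal sketch of what the Django variant of such an endpoint might look like; the module path, view name, and route are hypothetical, not the project's actual code. The Vue frontend can feed the returned JSON straight into an Echarts option object.

```python
# views.py -- hypothetical Django endpoint wrapping one Spark analysis
from django.http import HttpResponse
from analysis import analyzeFundingStageDistribution  # module name is assumed

def funding_stage_view(request):
    # The analysis function already returns a JSON string,
    # so it can be passed through without re-serialization
    return HttpResponse(analyzeFundingStageDistribution(),
                        content_type="application/json")

# urls.py -- route is illustrative
# urlpatterns = [path("api/funding-stages/", funding_stage_view)]
```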
三. System Demo
Big data advisor's recommendation: best practices for building an online education investment and financing data visualization analysis system with Hadoop + Spark
四. System UI Showcase
五. System Source Code
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, count, sum, avg, desc, when, year, month, countDistinct, lag
from pyspark.sql.types import DoubleType
import json

# SparkSession with adaptive query execution enabled, so shuffle partitions
# are coalesced automatically for these aggregation-heavy jobs
spark = (SparkSession.builder
         .appName("OnlineEducationInvestmentAnalysis")
         .config("spark.sql.adaptive.enabled", "true")
         .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
         .getOrCreate())
def analyzeFundingStageDistribution():
    # Load the raw investment records from HDFS and drop incomplete rows
    df = spark.read.option("header", "true").csv("hdfs://localhost:9000/education_investment_data/*.csv")
    df = df.filter(col("funding_stage").isNotNull() & col("investment_amount").isNotNull())
    df = df.withColumn("investment_amount", col("investment_amount").cast(DoubleType()))
    # Deal count, total and average amount per funding stage
    stage_stats = df.groupBy("funding_stage").agg(
        count("*").alias("deal_count"),
        sum("investment_amount").alias("total_amount"),
        avg("investment_amount").alias("avg_amount")
    ).orderBy(desc("total_amount"))
    total_deals = df.count()
    total_amount = df.agg(sum("investment_amount")).collect()[0][0]
    # Express each stage as a percentage of all deals and of all capital
    stage_distribution = stage_stats.withColumn(
        "deal_percentage",
        (col("deal_count") / total_deals * 100).cast("decimal(5,2)")
    ).withColumn(
        "amount_percentage",
        (col("total_amount") / total_amount * 100).cast("decimal(5,2)")
    )
    # Quarterly deal flow per stage; year()/month() implicitly cast the
    # date string, which assumes investment_date is in yyyy-MM-dd format
    quarterly_trend = df.withColumn("year", year(col("investment_date"))).withColumn(
        "quarter",
        when(month(col("investment_date")).between(1, 3), "Q1")
        .when(month(col("investment_date")).between(4, 6), "Q2")
        .when(month(col("investment_date")).between(7, 9), "Q3")
        .otherwise("Q4")
    ).groupBy("year", "quarter", "funding_stage").agg(
        count("*").alias("stage_deals"),
        sum("investment_amount").alias("stage_amount")
    ).orderBy("year", "quarter", "funding_stage")
    # Average ticket size relative to deal volume per stage
    stage_comparison = df.groupBy("funding_stage").agg(
        avg("investment_amount").alias("avg_investment"),
        count("*").alias("total_deals")
    ).withColumn(
        "investment_efficiency",
        col("avg_investment") / col("total_deals")
    ).orderBy(desc("investment_efficiency"))
    # Collect the small aggregated results and serialize for the API layer
    result_data = {
        "stage_distribution": stage_distribution.toPandas().to_dict('records'),
        "quarterly_trend": quarterly_trend.toPandas().to_dict('records'),
        "stage_comparison": stage_comparison.toPandas().to_dict('records'),
        "summary": {
            "total_deals": total_deals,
            "total_investment": float(total_amount) if total_amount else 0,
            "avg_deal_size": float(total_amount / total_deals) if total_deals > 0 and total_amount else 0
        }
    }
    return json.dumps(result_data, ensure_ascii=False, default=str)
def analyzeInvestorBehavior():
    # Load and clean the raw records: investor and amount must be present
    df = spark.read.option("header", "true").csv("hdfs://localhost:9000/education_investment_data/*.csv")
    df = df.filter(col("investor_name").isNotNull() & col("investment_amount").isNotNull())
    df = df.withColumn("investment_amount", col("investment_amount").cast(DoubleType()))
    # Per-investor activity profile; keep only investors with 3+ deals
    investor_stats = df.groupBy("investor_name").agg(
        count("*").alias("investment_count"),
        sum("investment_amount").alias("total_invested"),
        avg("investment_amount").alias("avg_investment"),
        countDistinct("company_name").alias("portfolio_companies")
    ).filter(col("investment_count") >= 3).orderBy(desc("total_invested"))
    top_investors = investor_stats.limit(20)
    # Funding stages where an investor places over 30% of its deals
    investor_stage_preference = df.groupBy("investor_name", "funding_stage").agg(
        count("*").alias("stage_investments"),
        sum("investment_amount").alias("stage_amount")
    ).join(investor_stats.select("investor_name", "investment_count"), "investor_name").withColumn(
        "stage_preference_ratio", col("stage_investments") / col("investment_count")
    ).filter(col("stage_preference_ratio") > 0.3).orderBy("investor_name", desc("stage_preference_ratio"))
    # Education sub-sectors that attract over 25% of an investor's deals
    investor_sector_focus = df.groupBy("investor_name", "education_sector").agg(
        count("*").alias("sector_investments"),
        sum("investment_amount").alias("sector_amount")
    ).join(investor_stats.select("investor_name", "investment_count"), "investor_name").withColumn(
        "sector_focus_ratio", col("sector_investments") / col("investment_count")
    ).filter(col("sector_focus_ratio") > 0.25).orderBy("investor_name", desc("sector_focus_ratio"))
    # Deals per year for the more active investors (5+ total deals)
    investment_frequency = df.withColumn("investment_year", year(col("investment_date"))).groupBy(
        "investor_name", "investment_year"
    ).agg(count("*").alias("yearly_investments")).join(
        investor_stats.select("investor_name", "investment_count"), "investor_name"
    ).filter(col("investment_count") >= 5).orderBy("investor_name", "investment_year")
    # Self-join on company and round to find co-investor pairs; note that
    # each pair appears twice, once as (A, B) and once as (B, A)
    co_investment_analysis = df.alias("df1").join(
        df.alias("df2"),
        (col("df1.company_name") == col("df2.company_name")) &
        (col("df1.funding_round") == col("df2.funding_round")) &
        (col("df1.investor_name") != col("df2.investor_name"))
    ).select(
        col("df1.investor_name").alias("investor_a"),
        col("df2.investor_name").alias("investor_b"),
        col("df1.company_name"),
        col("df1.investment_amount").alias("amount_a"),
        col("df2.investment_amount").alias("amount_b")
    ).groupBy("investor_a", "investor_b").agg(
        count("*").alias("co_investment_count"),
        sum(col("amount_a") + col("amount_b")).alias("total_co_investment")
    ).filter(col("co_investment_count") >= 2).orderBy(desc("co_investment_count"))
    # Collect the small aggregated results and serialize for the API layer
    result_data = {
        "top_investors": top_investors.toPandas().to_dict('records'),
        "stage_preferences": investor_stage_preference.toPandas().to_dict('records'),
        "sector_focus": investor_sector_focus.toPandas().to_dict('records'),
        "investment_frequency": investment_frequency.toPandas().to_dict('records'),
        "co_investments": co_investment_analysis.toPandas().to_dict('records')
    }
    return json.dumps(result_data, ensure_ascii=False, default=str)
def analyzeIndustryTrends():
    # Load and clean: rows need both a date and an amount
    df = spark.read.option("header", "true").csv("hdfs://localhost:9000/education_investment_data/*.csv")
    df = df.filter(col("investment_date").isNotNull() & col("investment_amount").isNotNull())
    df = df.withColumn("investment_amount", col("investment_amount").cast(DoubleType()))
    # year()/month() implicitly cast the date string; assumes yyyy-MM-dd
    df = df.withColumn("investment_year", year(col("investment_date")))
    df = df.withColumn("investment_month", month(col("investment_date")))
    # Annual deal volume, capital deployed, and distinct companies funded
    yearly_trends = df.groupBy("investment_year").agg(
        count("*").alias("annual_deals"),
        sum("investment_amount").alias("annual_investment"),
        avg("investment_amount").alias("avg_deal_size"),
        countDistinct("company_name").alias("companies_funded")
    ).orderBy("investment_year")
    monthly_trends = df.groupBy("investment_year", "investment_month").agg(
        count("*").alias("monthly_deals"),
        sum("investment_amount").alias("monthly_investment")
    ).orderBy("investment_year", "investment_month")
    sector_trends = df.groupBy("investment_year", "education_sector").agg(
        count("*").alias("sector_deals"),
        sum("investment_amount").alias("sector_investment")
    ).orderBy("investment_year", desc("sector_investment"))
    # Year-over-year growth rates via lag() over a year-ordered window;
    # the first year has no predecessor, so its growth rate is set to 0
    year_window = Window.orderBy("investment_year")
    growth_analysis = yearly_trends.withColumn(
        "deal_growth_rate",
        when(lag("annual_deals").over(year_window).isNull(), 0)
        .otherwise((col("annual_deals") - lag("annual_deals").over(year_window)) /
                   lag("annual_deals").over(year_window) * 100)
    ).withColumn(
        "investment_growth_rate",
        when(lag("annual_investment").over(year_window).isNull(), 0)
        .otherwise((col("annual_investment") - lag("annual_investment").over(year_window)) /
                   lag("annual_investment").over(year_window) * 100)
    )
    # Ratio of active investors to funded companies per year
    market_concentration = df.groupBy("investment_year").agg(
        countDistinct("investor_name").alias("active_investors"),
        countDistinct("company_name").alias("funded_companies")
    ).withColumn("investor_company_ratio",
        col("active_investors") / col("funded_companies")
    ).orderBy("investment_year")
    # Seasonal index: each month's deal count relative to the overall
    # monthly average (global window via an empty partitionBy)
    seasonal_patterns = df.groupBy("investment_month").agg(
        count("*").alias("month_total_deals"),
        sum("investment_amount").alias("month_total_investment"),
        avg("investment_amount").alias("month_avg_deal")
    ).withColumn("seasonal_index",
        col("month_total_deals") / avg("month_total_deals").over(Window.partitionBy())
    ).orderBy("investment_month")
    # Collect the small aggregated results and serialize for the API layer
    result_data = {
        "yearly_trends": yearly_trends.toPandas().to_dict('records'),
        "monthly_trends": monthly_trends.toPandas().to_dict('records'),
        "sector_trends": sector_trends.toPandas().to_dict('records'),
        "growth_analysis": growth_analysis.toPandas().to_dict('records'),
        "market_concentration": market_concentration.toPandas().to_dict('records'),
        "seasonal_patterns": seasonal_patterns.toPandas().to_dict('records')
    }
    return json.dumps(result_data, ensure_ascii=False, default=str)
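A quick way to smoke-test the three analysis functions (assuming the HDFS path above is reachable and the CSV columns match the names used in the code) is to run them in sequence and preview the JSON each one returns. This driver snippet is a usage sketch, not part of the original listing:

```python
if __name__ == "__main__":
    for fn in (analyzeFundingStageDistribution,
               analyzeInvestorBehavior,
               analyzeIndustryTrends):
        print(f"== {fn.__name__} ==")
        print(fn()[:500])  # preview the first 500 characters of each payload
    spark.stop()
```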