Enterprise-Level Big Data Application Example: Technical Deep Dive into a Cervical Cancer Risk Factor Analysis and Visualization System



1. Development Tools Overview

  • Big data frameworks: Hadoop + Spark (Hive is not used in this build, but it can be added as a customization)
  • Development languages: Python + Java (both versions are available)
  • Backend frameworks: Django + Spring Boot (Spring + Spring MVC + MyBatis) (both versions are available; a minimal Django routing sketch follows this list)
  • Frontend: Vue + ElementUI + ECharts + HTML + CSS + JavaScript + jQuery
  • Key technologies: Hadoop, HDFS, Spark, Spark SQL, Pandas, NumPy
  • Database: MySQL
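
The views shown in section 5 return plain JSON, so on the Django side they only need URL routes. Below is a minimal routing sketch; the module path and route names are assumptions for illustration, not the project's actual configuration.

# urls.py - hypothetical wiring of the analysis views to REST-style endpoints
from django.urls import path

from analysis import views  # assumed module holding the view functions from section 5

urlpatterns = [
    path("api/risk-analysis/", views.cervical_cancer_risk_analysis),
    path("api/patient-profile/", views.patient_risk_profile_analysis),
    path("api/dashboard/", views.visualization_dashboard_data),
]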

2. System Overview

The Big Data-Based Cervical Cancer Risk Factor Analysis and Visualization System is a medical data analysis platform that combines Hadoop distributed storage, the Spark computing engine, and modern web technologies. Python is the primary development language, Django provides the backend service layer, and a Vue + ElementUI + ECharts stack powers the interactive frontend and data visualizations. The core functionality covers eight modules: user management, cervical cancer risk data management, data detail views, a large-screen visualization dashboard, demographic and lifestyle analysis, patient risk profiling, screening method validation, and analysis of sexual behavior and STDs.

Large volumes of medical data are stored on HDFS; Spark SQL, together with Pandas and NumPy, carries out the data mining and statistical analysis; and the results, including the associations among cervical cancer risk factors and the prediction outputs, are presented through intuitive charts and a dashboard to support medical decision making.
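
To make the storage-and-compute flow concrete, here is a minimal sketch of how raw screening records could be read from HDFS into Spark and queried with Spark SQL. The HDFS path and column names are assumptions for illustration rather than the project's actual layout.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("CervicalCancerETL").getOrCreate()

# Hypothetical HDFS location of the raw screening records.
raw_df = (spark.read
          .option("header", "true")
          .option("inferSchema", "true")
          .csv("hdfs://namenode:9000/data/cervical_cancer/risk_data.csv"))

# Expose the DataFrame to Spark SQL so analyses can mix SQL with the DataFrame API.
raw_df.createOrReplaceTempView("risk_data")
spark.sql("SELECT age_group, COUNT(*) AS n, AVG(risk_score) AS avg_risk "
          "FROM risk_data GROUP BY age_group ORDER BY avg_risk DESC").show()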

3. System Feature Demo

(Demo video: Enterprise-Level Big Data Application Example: Technical Deep Dive into a Cervical Cancer Risk Factor Analysis and Visualization System)

4. System Interface Screenshots

(Screenshots of the system interface)

5. Core Source Code


from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, when, desc
from django.http import JsonResponse
from django.views.decorators.csrf import csrf_exempt
import pandas as pd
import numpy as np
import json

# One shared SparkSession for all views below; adaptive query execution is enabled to tune shuffles at runtime.
spark = (SparkSession.builder
         .appName("CervicalCancerAnalysis")
         .config("spark.sql.adaptive.enabled", "true")
         .getOrCreate())

def cervical_cancer_risk_analysis(request):
    # Load the risk dataset from MySQL over JDBC (connection details and credentials are placeholders).
    df = (spark.read.format("jdbc")
          .option("url", "jdbc:mysql://localhost:3306/cervical_cancer")
          .option("dbtable", "risk_data")
          .option("user", "root")
          .option("password", "password")
          .load())
    # Average risk score per age group, highest first.
    age_risk = df.groupBy("age_group").agg(count("*").alias("total_count"), avg("risk_score").alias("avg_risk")).orderBy(desc("avg_risk"))
    # Combined effect of smoking and drinking status on the average risk score.
    lifestyle_risk = df.groupBy("smoking_status", "drinking_status").agg(count("*").alias("count"), avg("risk_score").alias("risk")).orderBy(desc("risk"))
    # Patients above the 0.7 risk threshold, with their main risk factors.
    high_risk_patients = df.filter(col("risk_score") > 0.7).select("patient_id", "age", "risk_score", "main_risk_factors")
    # HPV status and STD history cross-tabulated against average risk.
    infection_correlation = df.groupBy("hpv_status", "std_history").agg(count("*").alias("patient_count"), avg("risk_score").alias("average_risk"))
    # Detection rate per screening method: detected cases divided by total screened.
    screening_effectiveness = df.groupBy("screening_method").agg(count("*").alias("total_screened"), count(when(col("cancer_detected") == 1, 1)).alias("detected_cases"))
    screening_effectiveness = screening_effectiveness.withColumn("detection_rate", col("detected_cases") / col("total_screened"))
    # Case counts and average risk per region.
    regional_distribution = df.groupBy("region").agg(count("*").alias("case_count"), avg("risk_score").alias("avg_risk_score"))
    age_risk_pandas = age_risk.toPandas()
    lifestyle_risk_pandas = lifestyle_risk.toPandas()
    high_risk_pandas = high_risk_patients.toPandas()
    result_data = {
        "age_risk_analysis": age_risk_pandas.to_dict('records'),
        "lifestyle_analysis": lifestyle_risk_pandas.to_dict('records'),
        "high_risk_patients": high_risk_pandas.to_dict('records'),
        "infection_correlation": infection_correlation.toPandas().to_dict('records'),
        "screening_effectiveness": screening_effectiveness.toPandas().to_dict('records'),
        "regional_distribution": regional_distribution.toPandas().to_dict('records')
    }
    return JsonResponse(result_data)

def patient_risk_profile_analysis(request):
    patient_id = request.GET.get('patient_id')
    # Load per-patient detail records from MySQL over JDBC (connection details and credentials are placeholders).
    df = (spark.read.format("jdbc")
          .option("url", "jdbc:mysql://localhost:3306/cervical_cancer")
          .option("dbtable", "patient_details")
          .option("user", "root")
          .option("password", "password")
          .load())
    # Guard against an unknown patient_id instead of indexing an empty collect() result.
    matched = df.filter(col("patient_id") == patient_id).limit(1).collect()
    if not matched:
        return JsonResponse({"error": "patient not found"}, status=404)
    patient_data = matched[0]
    # Peers in the same age group with the same smoking status, excluding the patient themselves.
    similar_patients = df.filter((col("age_group") == patient_data["age_group"]) & (col("smoking_status") == patient_data["smoking_status"]) & (col("patient_id") != patient_id))
    # Compare the patient's value for each risk factor against the population average.
    risk_factors = ["age", "smoking_years", "sexual_partners", "pregnancies", "std_history_count"]
    patient_scores = {}
    for factor in risk_factors:
        factor_avg = df.agg(avg(col(factor))).collect()[0][0]
        patient_value = patient_data[factor]
        if patient_value > factor_avg:
            risk_level = "高风险" if patient_value > factor_avg * 1.5 else "中等风险"
        else:
            risk_level = "低风险"
        patient_scores[factor] = {"value": patient_value, "average": factor_avg, "risk_level": risk_level}
    similar_patients_data = similar_patients.select("patient_id", "risk_score", "cancer_outcome").limit(10).toPandas()
    # Feature vector assembled for a potential ML model; the heuristic below does not use it yet.
    risk_prediction_features = np.array([[patient_data["age"], patient_data["smoking_years"], patient_data["sexual_partners"], patient_data["pregnancies"], patient_data["std_history_count"]]])
    # Hand-tuned additive heuristic: each factor contributes a weighted increment on top of a base risk, capped at 0.95 below.
    base_risk = 0.1
    age_factor = 0.02 * (patient_data["age"] - 25) if patient_data["age"] > 25 else 0
    smoking_factor = 0.03 * patient_data["smoking_years"]
    partner_factor = 0.01 * patient_data["sexual_partners"] if patient_data["sexual_partners"] > 2 else 0
    pregnancy_factor = 0.005 * patient_data["pregnancies"] if patient_data["pregnancies"] > 3 else 0
    std_factor = 0.05 * patient_data["std_history_count"]
    predicted_risk = min(base_risk + age_factor + smoking_factor + partner_factor + pregnancy_factor + std_factor, 0.95)
    profile_result = {
        "patient_basic_info": dict(patient_data.asDict()),
        "risk_factor_scores": patient_scores,
        "similar_patients": similar_patients_data.to_dict('records'),
        "predicted_risk_score": round(predicted_risk, 3),
        "risk_category": "高风险" if predicted_risk > 0.6 else "中等风险" if predicted_risk > 0.3 else "低风险"
    }
    return JsonResponse(profile_result)

def visualization_dashboard_data(request):
    # Load the consolidated dataset that backs the dashboard (connection details and credentials are placeholders).
    df = (spark.read.format("jdbc")
          .option("url", "jdbc:mysql://localhost:3306/cervical_cancer")
          .option("dbtable", "comprehensive_data")
          .option("user", "root")
          .option("password", "password")
          .load())
    # Headline counts for the dashboard summary cards.
    total_patients = df.count()
    high_risk_count = df.filter(col("risk_score") > 0.6).count()
    medium_risk_count = df.filter((col("risk_score") > 0.3) & (col("risk_score") <= 0.6)).count()
    low_risk_count = df.filter(col("risk_score") <= 0.3).count()
    age_distribution = df.groupBy("age_group").agg(count("*").alias("count"), avg("risk_score").alias("avg_risk")).orderBy("age_group")
    monthly_trends = df.groupBy("screening_month").agg(count("*").alias("screening_count"), count(when(col("risk_score") > 0.6, 1)).alias("high_risk_count"))
    regional_heatmap = df.groupBy("province", "city").agg(count("*").alias("case_count"), avg("risk_score").alias("avg_risk"))
    screening_methods_comparison = df.groupBy("primary_screening_method").agg(count("*").alias("usage_count"), avg("detection_accuracy").alias("avg_accuracy"))
    risk_factor_correlation = df.select("smoking_status", "drinking_status", "hpv_infection", "std_history", "risk_score")
    # Pearson correlation across the selected columns; numeric_only (pandas >= 1.5) skips any non-numeric encodings.
    correlation_matrix = risk_factor_correlation.toPandas().corr(numeric_only=True)
    top_risk_factors = df.groupBy("primary_risk_factor").agg(count("*").alias("frequency")).orderBy(desc("frequency")).limit(10)
    dashboard_data = {
        "summary_statistics": {
            "total_patients": total_patients,
            "high_risk_patients": high_risk_count,
            "medium_risk_patients": medium_risk_count,
            "low_risk_patients": low_risk_count,
            "high_risk_percentage": round((high_risk_count / total_patients) * 100, 2)
        },
        "age_distribution": age_distribution.toPandas().to_dict('records'),
        "monthly_screening_trends": monthly_trends.toPandas().to_dict('records'),
        "regional_distribution": regional_heatmap.toPandas().to_dict('records'),
        "screening_methods_analysis": screening_methods_comparison.toPandas().to_dict('records'),
        "risk_factor_correlation": correlation_matrix.to_dict(),
        "top_risk_factors": top_risk_factors.toPandas().to_dict('records')
    }
    return JsonResponse(dashboard_data)
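
To verify that the endpoints return the expected JSON shape, here is a small usage sketch with Django's test client. It assumes a configured Django settings module and the hypothetical routes from the sketch in section 1.

# Smoke test of the dashboard endpoint via Django's test client.
from django.test import Client

client = Client()
response = client.get("/api/dashboard/")
payload = response.json()
print(payload["summary_statistics"])      # headline counts
print(payload["top_risk_factors"][:3])    # three most frequent primary risk factors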




6. System Documentation

(Screenshot of the system documentation)

The End