尚硅谷 Big Data Project "Online Consultation Offline Data Warehouse" (线上问诊之离线数仓): A Hands-on Tutorial

尚硅谷's Online Consultation Offline Data Warehouse: An Architecture Practice for Intelligent Medical Data

Amid the wave of digital transformation, the data produced by the healthcare industry is increasingly valuable. 尚硅谷's Online Consultation Offline Data Warehouse project gives learners a complete, hands-on data warehouse case study, covering the full pipeline from data ingestion to business insight.

Project Architecture and Business Background

Business Scenario Analysis

An online consultation platform produces data across several dimensions:

  • User behavior data: consultation records, drug searches, doctor ratings
  • Business transaction data: orders, payment records, prescription data
  • Clinical data: symptom information, medication advice, diagnosis results
  • System log data: user access logs, API call records

Technical Architecture Design

The project uses the classic Lambda architecture, covering both real-time and offline processing:

Data Sources → Ingestion → Storage → Processing → Application
      ↓            ↓           ↓          ↓            ↓
   MySQL      Flume/Canal    HDFS       Hive       Superset
   Kafka                     HBase     SparkSQL    DataX
   log files                 MySQL      Spark      reporting

Data Ingestion Layer Implementation

Full-Load Synchronization

-- Example source-table schemas in the business database
CREATE TABLE medical_consultation (
    consultation_id BIGINT PRIMARY KEY COMMENT 'Consultation ID',
    user_id BIGINT NOT NULL COMMENT 'User ID',
    doctor_id BIGINT NOT NULL COMMENT 'Doctor ID',
    department_id INT COMMENT 'Department ID',
    symptoms TEXT COMMENT 'Symptom description',
    diagnosis_result VARCHAR(500) COMMENT 'Diagnosis result',
    prescription_id BIGINT COMMENT 'Prescription ID',
    consultation_fee DECIMAL(10,2) COMMENT 'Consultation fee',
    consultation_status TINYINT COMMENT 'Consultation status',
    user_rating TINYINT COMMENT 'User rating (referenced by downstream ETL)',
    start_time DATETIME COMMENT 'Start time',
    end_time DATETIME COMMENT 'End time',
    create_time DATETIME DEFAULT CURRENT_TIMESTAMP,
    update_time DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP
) COMMENT 'Consultation records';

CREATE TABLE user_behavior_log (
    log_id BIGINT AUTO_INCREMENT PRIMARY KEY,
    user_id BIGINT NOT NULL,
    behavior_type VARCHAR(50) COMMENT 'Behavior type: browse/search/consult',
    page_url VARCHAR(500),
    search_keyword VARCHAR(200),
    doctor_id BIGINT,
    timestamp BIGINT COMMENT 'Behavior timestamp',
    ip_address VARCHAR(50),
    user_agent VARCHAR(500)
) COMMENT 'User behavior log'
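
The DDL above defines the source tables but omits the full-load step itself. Below is a minimal PySpark sketch of a one-shot batch sync over JDBC; the MySQL endpoint, credentials, and target path are placeholders (the course stack could equally use DataX or Sqoop for this hop).

# Full-load sync sketch: read a MySQL table over JDBC and land the snapshot
# under an ODS directory on HDFS. Host, credentials, and paths are illustrative.
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("FullSyncMedicalConsultation")
         .enableHiveSupport()
         .getOrCreate())

# Read the whole source table in one batch; a large table would add
# partitionColumn/lowerBound/upperBound/numPartitions for parallel reads.
df = (spark.read.format("jdbc")
      .option("url", "jdbc:mysql://mysql-host:3306/medical_db")
      .option("dbtable", "medical_consultation")
      .option("user", "etl_user")
      .option("password", "etl_password")
      .load())

# Land the day's snapshot as Parquet under the partition directory.
(df.write.mode("overwrite")
   .parquet("/data/warehouse/ods/medical_consultation_full/dt=2024-01-20"))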

Incremental Data Capture

// Incremental MySQL capture with a Canal client, forwarding binlog rows to Kafka
public class CanalDataCollector {
    private static final Logger logger = LoggerFactory.getLogger(CanalDataCollector.class);

    // Producer used to forward parsed binlog rows downstream
    private final KafkaProducer<String, String> kafkaProducer;

    public CanalDataCollector(KafkaProducer<String, String> kafkaProducer) {
        this.kafkaProducer = kafkaProducer;
    }

    public void startCanalClient() {
        // Connect to the Canal cluster registered in ZooKeeper
        CanalConnector connector = CanalConnectors.newClusterConnector(
            "192.168.1.100:2181", 
            "medical_warehouse", 
            "", 
            ""
        );
        
        try {
            connector.connect();
            // Subscribe to every table in medical_db
            connector.subscribe("medical_db\\..*");
            
            while (true) {
                Message message = connector.getWithoutAck(100);
                long batchId = message.getId();
                
                if (batchId != -1 && !message.getEntries().isEmpty()) {
                    processEntries(message.getEntries());
                    connector.ack(batchId); // acknowledge the batch
                } else {
                    Thread.sleep(1000); // nothing fetched, back off briefly
                }
            }
        } catch (Exception e) {
            logger.error("Canal client error", e);
        } finally {
            connector.disconnect();
        }
    }
    
    private void processEntries(List<CanalEntry.Entry> entries) {
        for (CanalEntry.Entry entry : entries) {
            if (entry.getEntryType() == CanalEntry.EntryType.ROWDATA) {
                processRowChange(entry);
            }
        }
    }
    
    private void processRowChange(CanalEntry.Entry entry) {
        try {
            CanalEntry.RowChange rowChange = CanalEntry.RowChange.parseFrom(entry.getStoreValue());
            
            for (CanalEntry.RowData rowData : rowChange.getRowDatasList()) {
                // Route each row change to an ODS topic named after its table
                String topic = "ods_" + entry.getHeader().getTableName();
                String key = buildMessageKey(entry, rowData);
                String value = buildMessageValue(entry, rowData);
                
                // Forward to Kafka
                kafkaProducer.send(new ProducerRecord<>(topic, key, value));
            }
        } catch (Exception e) {
            logger.error("Failed to parse binlog entry", e);
        }
    }
}

Data Storage and the ODS Layer

HDFS Partitioning Strategy

-- Create the ODS-layer raw table
CREATE EXTERNAL TABLE ods_medical_consultation (
    consultation_id BIGINT,
    user_id BIGINT,
    doctor_id BIGINT,
    department_id INT,
    symptoms STRING,
    diagnosis_result STRING,
    prescription_id BIGINT,
    consultation_fee DECIMAL(10,2),
    consultation_status TINYINT,
    user_rating TINYINT,
    start_time STRING,
    end_time STRING,
    create_time STRING,
    update_time STRING
) COMMENT 'Raw consultation records'
PARTITIONED BY (dt STRING COMMENT 'Partition date')
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\001'
STORED AS TEXTFILE
LOCATION '/data/warehouse/ods/medical_consultation';

-- Load a day's data into the ODS layer
-- (LOAD DATA creates the partition if it does not already exist)
LOAD DATA INPATH '/data/source/medical_consultation/2024-01-20' 
INTO TABLE ods_medical_consultation PARTITION (dt='2024-01-20');
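
In production this load runs once per day, so the partition date is parameterized rather than hard-coded. A minimal scheduler-friendly wrapper might look like this sketch (the argument handling and app name are illustrative):

# Daily ODS load sketch: take the partition date as an argument so the same
# job can be scheduled daily. Paths mirror the DDL above.
import sys
from pyspark.sql import SparkSession

dt = sys.argv[1] if len(sys.argv) > 1 else "2024-01-20"

spark = (SparkSession.builder
         .appName("OdsConsultationLoad")
         .enableHiveSupport()
         .getOrCreate())

# LOAD DATA creates the partition if it does not exist yet.
spark.sql(f"""
    LOAD DATA INPATH '/data/source/medical_consultation/{dt}'
    INTO TABLE ods_medical_consultation PARTITION (dt='{dt}')
""")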

Processing User Behavior Logs

// Spark Structured Streaming job for user behavior logs
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

object UserBehaviorETL {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("UserBehaviorETL")
      .enableHiveSupport()
      .getOrCreate()
    
    import spark.implicits._
    
    // Read user behavior events from Kafka
    val kafkaDF = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "kafka1:9092,kafka2:9092")
      .option("subscribe", "user_behavior")
      .load()
    
    // Schema of the JSON payload
    val behaviorSchema = new StructType()
      .add("log_id", LongType)
      .add("user_id", LongType)
      .add("behavior_type", StringType)
      .add("page_url", StringType)
      .add("search_keyword", StringType)
      .add("doctor_id", LongType)
      .add("timestamp", LongType)
      .add("ip_address", StringType)
      .add("user_agent", StringType)
    
    // Parse the JSON and derive date/hour partition columns
    val parsedDF = kafkaDF
      .select(from_json($"value".cast(StringType), behaviorSchema).as("data"))
      .select("data.*")
      .withColumn("date_str", from_unixtime($"timestamp" / 1000, "yyyy-MM-dd"))
      .withColumn("hour_str", from_unixtime($"timestamp" / 1000, "HH"))
    
    // Write to partitioned Parquet directories on HDFS
    val query = parsedDF
      .writeStream
      .outputMode("append")
      .format("parquet")
      .option("path", "/data/warehouse/ods/user_behavior")
      .option("checkpointLocation", "/checkpoint/user_behavior")
      .partitionBy("date_str", "hour_str")
      .start()
    
    query.awaitTermination()
  }
}
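
One operational detail: the streaming job writes Parquet directories straight to HDFS, so the Hive metastore does not yet know about the new date_str/hour_str partitions. Assuming an external table ods_user_behavior is defined over that path (the table name here is an assumption), a periodic repair step registers them:

# Partition registration sketch: MSCK REPAIR scans the table location and adds
# any partitions missing from the metastore. Assumes an external table
# ods_user_behavior defined over /data/warehouse/ods/user_behavior.
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("RepairUserBehaviorPartitions")
         .enableHiveSupport()
         .getOrCreate())

spark.sql("MSCK REPAIR TABLE ods_user_behavior")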

Dimensional Modeling

Dimension Table Design

-- Date dimension
CREATE TABLE dim_date (
    date_key INT COMMENT 'Date surrogate key',
    actual_date DATE COMMENT 'Calendar date',
    year INT COMMENT 'Year',
    quarter INT COMMENT 'Quarter',
    month INT COMMENT 'Month',
    week INT COMMENT 'Week',
    day_of_year INT COMMENT 'Day of year',
    day_of_month INT COMMENT 'Day of month',
    day_of_week INT COMMENT 'Day of week',
    is_weekend BOOLEAN COMMENT 'Weekend flag',
    holiday_flag BOOLEAN COMMENT 'Holiday flag'
) COMMENT 'Date dimension';

-- Doctor dimension
CREATE TABLE dim_doctor (
    doctor_key BIGINT COMMENT 'Doctor surrogate key',
    doctor_id BIGINT COMMENT 'Doctor ID',
    doctor_name STRING COMMENT 'Doctor name',
    department_id INT COMMENT 'Department ID',
    department_name STRING COMMENT 'Department name',
    title STRING COMMENT 'Professional title',
    specialty STRING COMMENT 'Specialty',
    work_years INT COMMENT 'Years of practice',
    consultation_count INT COMMENT 'Consultation count',
    avg_rating DECIMAL(3,2) COMMENT 'Average rating',
    start_date DATE COMMENT 'Effective date',
    end_date DATE COMMENT 'Expiry date',
    is_current BOOLEAN COMMENT 'Current version flag'
) COMMENT 'Doctor dimension';

-- User dimension (slowly changing dimension, type 2)
CREATE TABLE dim_user (
    user_key BIGINT COMMENT 'User surrogate key',
    user_id BIGINT COMMENT 'User ID',
    user_name STRING COMMENT 'User name',
    gender TINYINT COMMENT 'Gender',
    age INT COMMENT 'Age',
    age_group STRING COMMENT 'Age group',
    province STRING COMMENT 'Province',
    city STRING COMMENT 'City',
    registration_date DATE COMMENT 'Registration date',
    user_level TINYINT COMMENT 'User level',
    start_date DATE COMMENT 'Effective date',
    end_date DATE COMMENT 'Expiry date',
    is_current BOOLEAN COMMENT 'Current version flag'
) COMMENT 'User dimension';
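
The DDL declares dim_date but not how it is populated. A common approach is to generate the calendar rows programmatically; the PySpark sketch below builds one year of rows (the holiday flag is left false as a placeholder, since holiday calendars are business-specific):

# dim_date population sketch: generate one calendar year of rows and insert
# them positionally into the Hive table defined above.
from datetime import date, timedelta
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("BuildDimDate")
         .enableHiveSupport()
         .getOrCreate())

rows = []
d = date(2024, 1, 1)
while d.year == 2024:
    rows.append((
        int(d.strftime("%Y%m%d")),   # date_key, e.g. 20240120
        d,                           # actual_date
        d.year,
        (d.month - 1) // 3 + 1,      # quarter
        d.month,
        d.isocalendar()[1],          # ISO week number
        d.timetuple().tm_yday,       # day_of_year
        d.day,                       # day_of_month
        d.isoweekday(),              # day_of_week (Monday = 1)
        d.isoweekday() >= 6,         # is_weekend
        False,                       # holiday_flag placeholder
    ))
    d += timedelta(days=1)

cols = ["date_key", "actual_date", "year", "quarter", "month", "week",
        "day_of_year", "day_of_month", "day_of_week", "is_weekend", "holiday_flag"]
spark.createDataFrame(rows, cols).write.insertInto("dim_date", overwrite=True)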

Fact Table Design

-- Consultation fact table
CREATE TABLE fact_consultation (
    consultation_id BIGINT COMMENT 'Consultation ID',
    date_key INT COMMENT 'Date surrogate key',
    user_key BIGINT COMMENT 'User surrogate key',
    doctor_key BIGINT COMMENT 'Doctor surrogate key',
    department_key INT COMMENT 'Department surrogate key',
    time_key INT COMMENT 'Time surrogate key',
    
    -- Measures
    consultation_duration INT COMMENT 'Consultation duration (minutes)',
    consultation_fee DECIMAL(10,2) COMMENT 'Consultation fee',
    prescription_count INT COMMENT 'Prescription count',
    drug_count INT COMMENT 'Drug count',
    total_drug_fee DECIMAL(10,2) COMMENT 'Total drug fee',
    user_rating TINYINT COMMENT 'User rating',
    
    -- Degenerate dimensions
    symptoms STRING COMMENT 'Symptom description',
    diagnosis_result STRING COMMENT 'Diagnosis result'
) COMMENT 'Consultation fact table'
PARTITIONED BY (dt STRING COMMENT 'Partition date')
STORED AS PARQUET;

-- User behavior fact table
CREATE TABLE fact_user_behavior (
    behavior_id BIGINT COMMENT 'Behavior ID',
    date_key INT COMMENT 'Date surrogate key',
    user_key BIGINT COMMENT 'User surrogate key',
    time_key INT COMMENT 'Time surrogate key',
    behavior_type_key INT COMMENT 'Behavior type surrogate key',
    
    -- Measures
    page_stay_duration INT COMMENT 'Page dwell time (seconds)',
    search_count INT COMMENT 'Search count',
    consultation_click_count INT COMMENT 'Consultation click count',
    
    -- Degenerate dimensions
    page_url STRING COMMENT 'Page URL',
    search_keyword STRING COMMENT 'Search keyword',
    doctor_id BIGINT COMMENT 'Doctor ID'
) COMMENT 'User behavior fact table'
PARTITIONED BY (dt STRING COMMENT 'Partition date')
STORED AS PARQUET;

ETL Processing Pipeline

Dimension Table ETL

-- Doctor dimension ETL
INSERT OVERWRITE TABLE dim_doctor
SELECT 
    ROW_NUMBER() OVER(ORDER BY doctor_id) + 1000000 as doctor_key,
    doctor_id,
    doctor_name,
    department_id,
    department_name,
    title,
    specialty,
    work_years,
    consultation_count,
    avg_rating,
    '2024-01-01' as start_date,
    '9999-12-31' as end_date,
    true as is_current
FROM (
    SELECT 
        d.doctor_id,
        d.doctor_name,
        d.department_id,
        dep.department_name,
        d.title,
        d.specialty,
        d.work_years,
        COUNT(DISTINCT c.consultation_id) as consultation_count,
        AVG(COALESCE(c.user_rating, 0)) as avg_rating
    FROM ods_doctor d
    LEFT JOIN ods_department dep ON d.department_id = dep.department_id
    -- Restrict the consultation scan to the day's partition
    LEFT JOIN ods_medical_consultation c 
        ON d.doctor_id = c.doctor_id AND c.dt = '2024-01-20'
    WHERE d.dt = '2024-01-20'
    GROUP BY 
        d.doctor_id, d.doctor_name, d.department_id, dep.department_name,
        d.title, d.specialty, d.work_years
) t;
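
Note that this script rebuilds the whole dimension and pins start_date to a constant, so it never actually exercises the SCD Type 2 columns. For dim_user, the daily merge must close out changed rows and open new current versions. The sketch below shows one way to do it, assuming a staging view user_changes_today that holds the day's changed user records; the view name, the hash-based surrogate key, and the date are all illustrative:

# SCD Type-2 merge sketch for dim_user. Assumes a staging view
# user_changes_today with the users whose attributes changed on process_date.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("DimUserScd2").enableHiveSupport().getOrCreate()
process_date = "2024-01-20"

merged = spark.sql(f"""
    -- 1) Carry over history; close the current version of every changed user.
    SELECT u.user_key, u.user_id, u.user_name, u.gender, u.age, u.age_group,
           u.province, u.city, u.registration_date, u.user_level,
           u.start_date,
           CASE WHEN c.user_id IS NOT NULL AND u.is_current
                THEN date_sub('{process_date}', 1) ELSE u.end_date END AS end_date,
           CASE WHEN c.user_id IS NOT NULL THEN false ELSE u.is_current END AS is_current
    FROM dim_user u
    LEFT JOIN user_changes_today c ON u.user_id = c.user_id

    UNION ALL

    -- 2) Open a new current version for each changed user.
    SELECT hash(c.user_id, '{process_date}') AS user_key,
           c.user_id, c.user_name, c.gender, c.age, c.age_group,
           c.province, c.city, c.registration_date, c.user_level,
           to_date('{process_date}') AS start_date,
           to_date('9999-12-31') AS end_date,
           true AS is_current
    FROM user_changes_today c
""")

# Stage the result first: Spark refuses to overwrite a table it is reading from.
merged.write.mode("overwrite").saveAsTable("dim_user_scd2_staging")
spark.table("dim_user_scd2_staging").write.insertInto("dim_user", overwrite=True)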

Fact Table ETL

-- Consultation fact ETL
INSERT OVERWRITE TABLE fact_consultation PARTITION (dt='2024-01-20')
SELECT 
    c.consultation_id,
    dd.date_key,
    du.user_key,
    ddoc.doctor_key,
    ddep.department_key,
    dt.time_key,
    
    -- Consultation duration in minutes (Hive has no TIMESTAMPDIFF)
    CASE 
        WHEN c.start_time IS NOT NULL AND c.end_time IS NOT NULL 
        THEN CAST((unix_timestamp(c.end_time) - unix_timestamp(c.start_time)) / 60 AS INT)
        ELSE 0 
    END as consultation_duration,
    
    c.consultation_fee,
    
    -- Prescription measures
    COUNT(DISTINCT p.prescription_id) as prescription_count,
    SUM(pd.drug_quantity) as drug_count,
    SUM(pd.drug_price * pd.drug_quantity) as total_drug_fee,
    
    COALESCE(c.user_rating, 0) as user_rating,
    
    -- Degenerate dimensions
    c.symptoms,
    c.diagnosis_result
    
FROM ods_medical_consultation c
LEFT JOIN dim_date dd ON to_date(c.start_time) = dd.actual_date
LEFT JOIN dim_user du ON c.user_id = du.user_id AND du.is_current = true
LEFT JOIN dim_doctor ddoc ON c.doctor_id = ddoc.doctor_id AND ddoc.is_current = true
LEFT JOIN dim_department ddep ON c.department_id = ddep.department_id
LEFT JOIN dim_time dt ON HOUR(c.start_time) = dt.hour
LEFT JOIN ods_prescription p ON c.prescription_id = p.prescription_id
LEFT JOIN ods_prescription_detail pd ON p.prescription_id = pd.prescription_id

WHERE c.dt = '2024-01-20'
  AND c.consultation_status = 2  -- completed consultations only
GROUP BY 
    c.consultation_id, dd.date_key, du.user_key, ddoc.doctor_key, 
    ddep.department_key, dt.time_key, c.consultation_fee, c.user_rating,
    c.symptoms, c.diagnosis_result, c.start_time, c.end_time;

Data Quality Monitoring

Quality Check Scripts

# Data quality monitoring script
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

def data_quality_check(spark, check_date):
    """
    Run the data quality checks for one partition date.
    """
    quality_metrics = {}
    
    # Completeness: null rates of key foreign keys
    completeness_check = spark.sql(f"""
        SELECT 
            'ods_medical_consultation' as table_name,
            COUNT(*) as total_count,
            SUM(CASE WHEN user_id IS NULL THEN 1 ELSE 0 END) as null_user_count,
            SUM(CASE WHEN doctor_id IS NULL THEN 1 ELSE 0 END) as null_doctor_count,
            (SUM(CASE WHEN user_id IS NULL THEN 1 ELSE 0 END) / COUNT(*)) as null_user_ratio
        FROM ods_medical_consultation 
        WHERE dt = '{check_date}'
    """)
    
    # Consistency: share of duplicated consultation IDs
    consistency_check = spark.sql(f"""
        SELECT 
            'consultation' as check_type,
            COUNT(DISTINCT consultation_id) as distinct_consultations,
            COUNT(*) as total_records,
            1 - COUNT(DISTINCT consultation_id) / COUNT(*) as duplication_ratio
        FROM ods_medical_consultation 
        WHERE dt = '{check_date}'
    """)
    
    # Business-rule sanity: negative or implausibly large fees
    business_logic_check = spark.sql(f"""
        SELECT 
            'fee_check' as check_type,
            COUNT(*) as total_consultations,
            SUM(CASE WHEN consultation_fee < 0 THEN 1 ELSE 0 END) as negative_fee_count,
            SUM(CASE WHEN consultation_fee > 1000 THEN 1 ELSE 0 END) as abnormal_fee_count
        FROM ods_medical_consultation 
        WHERE dt = '{check_date}'
    """)
    
    # Collect the quality metrics
    quality_metrics['completeness'] = completeness_check.collect()
    quality_metrics['consistency'] = consistency_check.collect()
    quality_metrics['business_logic'] = business_logic_check.collect()
    
    return quality_metrics

def generate_quality_report(metrics, check_date):
    """
    Render the quality metrics as a plain-text report.
    """
    report = f"""
Data Quality Report
Check date: {check_date}
Generated at: {datetime.now()}

1. Completeness:
   - Total records: {metrics['completeness'][0]['total_count']}
   - Null user_id ratio: {metrics['completeness'][0]['null_user_ratio']:.2%}

2. Consistency:
   - Duplicate record ratio: {metrics['consistency'][0]['duplication_ratio']:.2%}

3. Business rules:
   - Abnormal fee records: {metrics['business_logic'][0]['abnormal_fee_count']}
    """
    
    return report
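
Run in the same module as the two functions above, a minimal driver ties them together:

# Usage sketch: run the checks for one day and print the report.
if __name__ == "__main__":
    spark = (SparkSession.builder
             .appName("DataQualityCheck")
             .enableHiveSupport()
             .getOrCreate())
    metrics = data_quality_check(spark, "2024-01-20")
    print(generate_quality_report(metrics, "2024-01-20"))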

Data Applications and Visualization

Business Metric Computation

-- Key business metric queries
-- 1. Daily consultation statistics
SELECT 
    dd.actual_date as stat_date,
    COUNT(*) as consultation_count,
    COUNT(DISTINCT user_key) as unique_users,
    COUNT(DISTINCT doctor_key) as active_doctors,
    AVG(consultation_duration) as avg_duration,
    SUM(consultation_fee) as total_revenue
FROM fact_consultation fc
JOIN dim_date dd ON fc.date_key = dd.date_key
WHERE dd.actual_date >= DATE_SUB(CURRENT_DATE, 30)
GROUP BY dd.actual_date
ORDER BY stat_date DESC;

-- 2. Department ranking by consultation volume
SELECT 
    department_name,
    COUNT(*) as consultation_count,
    AVG(user_rating) as avg_rating,
    SUM(consultation_fee) as total_revenue,
    COUNT(DISTINCT user_key) as served_users
FROM fact_consultation fc
JOIN dim_department dd ON fc.department_key = dd.department_key
WHERE fc.dt = '2024-01-20'
GROUP BY department_name
ORDER BY consultation_count DESC
LIMIT 10;

-- 3. User behavior analysis
SELECT 
    behavior_type,
    COUNT(*) as behavior_count,
    COUNT(DISTINCT user_key) as unique_users,
    AVG(page_stay_duration) as avg_stay_duration
FROM fact_user_behavior
WHERE dt = '2024-01-20'
GROUP BY behavior_type
ORDER BY behavior_count DESC;
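
To surface these metrics in Superset, the result sets are typically materialized into a MySQL reporting database (the architecture diagram earlier shows DataX on this hop; a plain Spark JDBC write works as well). A sketch with placeholder connection details:

# ADS export sketch: compute a daily summary and push it to the MySQL database
# that the BI layer reads. URL, credentials, and table name are illustrative.
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("AdsDailyExport")
         .enableHiveSupport()
         .getOrCreate())

daily = spark.sql("""
    SELECT dd.actual_date           AS stat_date,
           COUNT(*)                 AS consultation_count,
           SUM(fc.consultation_fee) AS total_revenue
    FROM fact_consultation fc
    JOIN dim_date dd ON fc.date_key = dd.date_key
    GROUP BY dd.actual_date
""")

(daily.write.format("jdbc")
    .option("url", "jdbc:mysql://report-db:3306/ads")
    .option("dbtable", "ads_consultation_daily")
    .option("user", "report_user")
    .option("password", "report_password")
    .mode("overwrite")
    .save())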

Project Takeaways and Value

Technical Takeaways

  1. Architecture design: applying the Lambda architecture to a healthcare scenario
  2. Data modeling: working fluently with dimensional modeling methods
  3. ETL development: building a complete data pipeline end to end
  4. Quality assurance: establishing a data quality management process

Business Value

  1. Operational analysis: multi-dimensional analysis of the consultation business
  2. User insight: understanding behavior patterns to improve the product experience
  3. Doctor evaluation: an assessment framework for doctors' service quality
  4. Decision support: data-driven input for management decisions

Through 尚硅谷's Online Consultation Offline Data Warehouse project, learners gain a complete command of data warehouse construction and a solid data infrastructure foundation for digital transformation in healthcare.