尚硅谷 Big Data Project 《线上问诊之离线数仓》 (Offline Data Warehouse for Online Medical Consultation): An Architecture Practice for Intelligent Healthcare Data
Amid the wave of digital transformation, the value of data in the healthcare industry has become increasingly evident. 尚硅谷's 《线上问诊之离线数仓》 project gives learners a complete, hands-on big-data warehouse case study, covering the full pipeline from data collection to business insight.
Project Architecture and Business Background
Business Scenario Analysis
An online consultation platform produces data across several dimensions:
- User behavior data: consultation records, drug searches, doctor reviews
- Business transaction data: order information, payment records, prescription data
- Clinical data: symptom information, medication advice, diagnosis results
- System log data: user access logs, API call records
Technical Architecture Design
The project adopts the classic Lambda architecture, accommodating both real-time and offline processing:
Data sources → Data collection → Data storage → Data processing → Data application
- Data sources: MySQL business tables, log files
- Data collection: Flume/Canal, Kafka
- Data storage: HDFS, HBase, MySQL
- Data processing: Hive, SparkSQL, Spark
- Data application: Superset, DataX, reporting systems
Data Collection Layer Implementation
Full Data Synchronization
-- Example business database table structures
CREATE TABLE medical_consultation (
    consultation_id BIGINT PRIMARY KEY COMMENT 'Consultation ID',
    user_id BIGINT NOT NULL COMMENT 'User ID',
    doctor_id BIGINT NOT NULL COMMENT 'Doctor ID',
    department_id INT COMMENT 'Department ID',
    symptoms TEXT COMMENT 'Symptom description',
    diagnosis_result VARCHAR(500) COMMENT 'Diagnosis result',
    prescription_id BIGINT COMMENT 'Prescription ID',
    consultation_fee DECIMAL(10,2) COMMENT 'Consultation fee',
    consultation_status TINYINT COMMENT 'Consultation status',
    user_rating TINYINT COMMENT 'User rating',
    start_time DATETIME COMMENT 'Start time',
    end_time DATETIME COMMENT 'End time',
    create_time DATETIME DEFAULT CURRENT_TIMESTAMP,
    update_time DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP
) COMMENT 'Consultation record table';
CREATE TABLE user_behavior_log (
    log_id BIGINT AUTO_INCREMENT PRIMARY KEY,
    user_id BIGINT NOT NULL,
    behavior_type VARCHAR(50) COMMENT 'Behavior type: browse/search/consult',
    page_url VARCHAR(500),
    search_keyword VARCHAR(200),
    doctor_id BIGINT,
    timestamp BIGINT COMMENT 'Behavior timestamp',
    ip_address VARCHAR(50),
    user_agent VARCHAR(500)
) COMMENT 'User behavior log table';
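The DDL above describes the source tables. For the full synchronization itself, a batch tool such as DataX (shown in the architecture above) typically exports a daily snapshot of each small or slowly changing table to HDFS, and the snapshot is then loaded into a date partition of the corresponding ODS table. A minimal sketch, assuming a hypothetical ods_doctor_full table and an export directory /data/source/doctor/2024-01-20:

-- Hypothetical ODS table holding the daily full snapshot of the doctor table
CREATE EXTERNAL TABLE IF NOT EXISTS ods_doctor_full (
    doctor_id BIGINT,
    doctor_name STRING,
    department_id INT,
    title STRING,
    specialty STRING,
    work_years INT
) COMMENT 'Daily full snapshot of doctors'
PARTITIONED BY (dt STRING COMMENT 'Partition date')
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001'
STORED AS TEXTFILE
LOCATION '/data/warehouse/ods/doctor_full';

-- Overwrite today's partition with the snapshot exported by the sync tool
LOAD DATA INPATH '/data/source/doctor/2024-01-20'
OVERWRITE INTO TABLE ods_doctor_full PARTITION (dt='2024-01-20');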
Incremental Data Collection
// Incremental MySQL data collection with Canal
import com.alibaba.otter.canal.client.CanalConnector;
import com.alibaba.otter.canal.client.CanalConnectors;
import com.alibaba.otter.canal.protocol.CanalEntry;
import com.alibaba.otter.canal.protocol.Message;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
import java.util.Properties;

public class CanalDataCollector {
    private static final Logger logger = LoggerFactory.getLogger(CanalDataCollector.class);

    private final KafkaProducer<String, String> kafkaProducer;

    public CanalDataCollector() {
        // Producer that forwards parsed binlog events to Kafka
        Properties props = new Properties();
        props.put("bootstrap.servers", "kafka1:9092,kafka2:9092");
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        this.kafkaProducer = new KafkaProducer<>(props);
    }

    public void startCanalClient() {
        // Connect to the Canal cluster via ZooKeeper
        CanalConnector connector = CanalConnectors.newClusterConnector(
                "192.168.1.100:2181",
                "medical_warehouse",
                "",
                ""
        );
        try {
            connector.connect();
            // Subscribe to every table in medical_db
            connector.subscribe("medical_db\\..*");
            while (true) {
                // Fetch a batch without auto-ack so a failed batch can be redelivered
                Message message = connector.getWithoutAck(100);
                long batchId = message.getId();
                if (batchId == -1 || message.getEntries().isEmpty()) {
                    Thread.sleep(1000); // no new data, back off briefly
                    continue;
                }
                try {
                    processEntries(message.getEntries());
                    connector.ack(batchId); // commit the batch
                } catch (Exception e) {
                    connector.rollback(batchId); // redeliver on failure
                    throw e;
                }
            }
        } catch (Exception e) {
            logger.error("Canal client error", e);
        } finally {
            connector.disconnect();
        }
    }

    private void processEntries(List<CanalEntry.Entry> entries) throws Exception {
        for (CanalEntry.Entry entry : entries) {
            // Keep only row-data entries, skipping transaction begin/end markers
            if (entry.getEntryType() == CanalEntry.EntryType.ROWDATA) {
                processRowChange(entry);
            }
        }
    }

    private void processRowChange(CanalEntry.Entry entry) throws Exception {
        CanalEntry.RowChange rowChange = CanalEntry.RowChange.parseFrom(entry.getStoreValue());
        for (CanalEntry.RowData rowData : rowChange.getRowDatasList()) {
            // One Kafka topic per source table, e.g. ods_medical_consultation
            String topic = "ods_" + entry.getHeader().getTableName();
            String key = buildMessageKey(entry, rowData);
            String value = buildMessageValue(entry, rowData);
            kafkaProducer.send(new ProducerRecord<>(topic, key, value));
        }
    }

    private String buildMessageKey(CanalEntry.Entry entry, CanalEntry.RowData rowData) {
        // Table name plus binlog offset keeps events for one table ordered per partition
        return entry.getHeader().getTableName() + "_" + entry.getHeader().getLogfileOffset();
    }

    private String buildMessageValue(CanalEntry.Entry entry, CanalEntry.RowData rowData) {
        // Naive JSON rendering of the after-image (for deletes, the before-image applies)
        StringBuilder sb = new StringBuilder("{");
        List<CanalEntry.Column> columns = rowData.getAfterColumnsList();
        for (int i = 0; i < columns.size(); i++) {
            if (i > 0) sb.append(',');
            sb.append('"').append(columns.get(i).getName()).append("\":\"")
              .append(columns.get(i).getValue()).append('"');
        }
        return sb.append('}').toString();
    }
}
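Because the getWithoutAck/ack cycle yields at-least-once delivery, the same binlog event can reach Kafka more than once after a retry. Downstream ODS jobs therefore typically deduplicate on primary key plus update_time, as in the merge example in the ODS section below.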
Data Storage and ODS Layer Construction
HDFS Data Partitioning Strategy
-- Create the ODS-layer raw data table
CREATE EXTERNAL TABLE ods_medical_consultation (
    consultation_id BIGINT,
    user_id BIGINT,
    doctor_id BIGINT,
    department_id INT,
    symptoms STRING,
    diagnosis_result STRING,
    prescription_id BIGINT,
    consultation_fee DECIMAL(10,2),
    consultation_status TINYINT,
    user_rating TINYINT,
    start_time STRING,
    end_time STRING,
    create_time STRING,
    update_time STRING
) COMMENT 'Raw consultation record table'
PARTITIONED BY (dt STRING COMMENT 'Partition date')
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\001'
STORED AS TEXTFILE
LOCATION '/data/warehouse/ods/medical_consultation';
-- Load data into the ODS layer (LOAD ... PARTITION creates the partition if absent)
LOAD DATA INPATH '/data/source/medical_consultation/2024-01-20'
INTO TABLE ods_medical_consultation PARTITION (dt='2024-01-20');
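Changes captured by Canal land in Kafka and are usually consumed into an incremental ODS table rather than the snapshot table above. A common merge pattern then combines the previous day's full state with the day's changes, keeping only the latest version of each key. A hedged sketch, assuming hypothetical ods_consultation_inc (change records) and dwd_consultation_full (merged state) tables with matching columns:

-- Keep the most recent version of each consultation_id
INSERT OVERWRITE TABLE dwd_consultation_full PARTITION (dt='2024-01-20')
SELECT consultation_id, user_id, doctor_id, consultation_fee,
       consultation_status, update_time
FROM (
    SELECT m.*,
           ROW_NUMBER() OVER (PARTITION BY consultation_id
                              ORDER BY update_time DESC) AS rn
    FROM (
        SELECT consultation_id, user_id, doctor_id, consultation_fee,
               consultation_status, update_time
        FROM dwd_consultation_full WHERE dt = '2024-01-19'   -- yesterday's state
        UNION ALL
        SELECT consultation_id, user_id, doctor_id, consultation_fee,
               consultation_status, update_time
        FROM ods_consultation_inc WHERE dt = '2024-01-20'    -- today's changes
    ) m
) t
WHERE t.rn = 1;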
User Behavior Log Processing
// Processing user behavior logs with Spark
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

object UserBehaviorETL {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("UserBehaviorETL")
      .enableHiveSupport()
      .getOrCreate()
    import spark.implicits._

    // Read user behavior events from Kafka
    val kafkaDF = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "kafka1:9092,kafka2:9092")
      .option("subscribe", "user_behavior")
      .load()

    // Schema of the JSON payload
    val behaviorSchema = new StructType()
      .add("log_id", LongType)
      .add("user_id", LongType)
      .add("behavior_type", StringType)
      .add("page_url", StringType)
      .add("search_keyword", StringType)
      .add("doctor_id", LongType)
      .add("timestamp", LongType)
      .add("ip_address", StringType)
      .add("user_agent", StringType)

    // Parse the JSON value; timestamp is in milliseconds, so divide by 1000
    // before from_unixtime to derive the date/hour partition columns
    val parsedDF = kafkaDF
      .select(from_json($"value".cast(StringType), behaviorSchema).as("data"))
      .select("data.*")
      .withColumn("date_str", from_unixtime(($"timestamp" / 1000).cast(LongType), "yyyy-MM-dd"))
      .withColumn("hour_str", from_unixtime(($"timestamp" / 1000).cast(LongType), "HH"))

    // Stream Parquet files into HDFS, partitioned by date and hour
    val query = parsedDF
      .writeStream
      .outputMode("append")
      .format("parquet")
      .option("path", "/data/warehouse/ods/user_behavior")
      .option("checkpointLocation", "/checkpoint/user_behavior")
      .partitionBy("date_str", "hour_str")
      .start()

    query.awaitTermination()
  }
}
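One operational detail: the partitions this streaming job writes exist only as HDFS directories until they are registered in the Hive metastore. Assuming an external ods_user_behavior table is defined over the same path, a scheduled repair (or an explicit ADD PARTITION per hour) makes them queryable:

-- Register all newly written HDFS partitions in the metastore
MSCK REPAIR TABLE ods_user_behavior;

-- Or register a single partition explicitly
ALTER TABLE ods_user_behavior ADD IF NOT EXISTS
PARTITION (date_str='2024-01-20', hour_str='10');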
Data Warehouse Dimensional Modeling
Dimension Table Design
-- Date dimension table
CREATE TABLE dim_date (
    date_key INT COMMENT 'Date surrogate key',
    actual_date DATE COMMENT 'Actual date',
    year INT COMMENT 'Year',
    quarter INT COMMENT 'Quarter',
    month INT COMMENT 'Month',
    week INT COMMENT 'Week',
    day_of_year INT COMMENT 'Day of year',
    day_of_month INT COMMENT 'Day of month',
    day_of_week INT COMMENT 'Day of week',
    is_weekend BOOLEAN COMMENT 'Is weekend',
    holiday_flag BOOLEAN COMMENT 'Is holiday'
) COMMENT 'Date dimension table';
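dim_date is normally seeded once per year from a generated sequence rather than loaded from a source system. A hedged sketch for 2024 (366 days), using posexplode to produce one row per day; holiday_flag is left false here because in practice it would be joined from a separately maintained holiday table:

INSERT OVERWRITE TABLE dim_date
SELECT
    CAST(date_format(d, 'yyyyMMdd') AS INT) AS date_key,
    CAST(d AS DATE)                         AS actual_date,
    year(d)                                 AS year,
    quarter(d)                              AS quarter,
    month(d)                                AS month,
    weekofyear(d)                           AS week,
    CAST(date_format(d, 'D') AS INT)        AS day_of_year,
    dayofmonth(d)                           AS day_of_month,
    CAST(date_format(d, 'u') AS INT)        AS day_of_week,  -- ISO: Mon=1 .. Sun=7
    CAST(date_format(d, 'u') AS INT) >= 6   AS is_weekend,
    false                                   AS holiday_flag
FROM (
    -- space(365) splits into 366 tokens, so pos runs 0..365: every day of 2024
    SELECT date_add('2024-01-01', pos) AS d
    FROM (SELECT 1) dummy
    LATERAL VIEW posexplode(split(space(365), ' ')) seq AS pos, val
) dates;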
-- Doctor dimension table
CREATE TABLE dim_doctor (
    doctor_key BIGINT COMMENT 'Doctor surrogate key',
    doctor_id BIGINT COMMENT 'Doctor ID',
    doctor_name STRING COMMENT 'Doctor name',
    department_id INT COMMENT 'Department ID',
    department_name STRING COMMENT 'Department name',
    title STRING COMMENT 'Professional title',
    specialty STRING COMMENT 'Specialty',
    work_years INT COMMENT 'Years of experience',
    consultation_count INT COMMENT 'Consultation count',
    avg_rating DECIMAL(3,2) COMMENT 'Average rating',
    start_date DATE COMMENT 'Effective start date',
    end_date DATE COMMENT 'Effective end date',
    is_current BOOLEAN COMMENT 'Is current version'
) COMMENT 'Doctor dimension table';
-- User dimension table (slowly changing dimension, Type 2)
CREATE TABLE dim_user (
    user_key BIGINT COMMENT 'User surrogate key',
    user_id BIGINT COMMENT 'User ID',
    user_name STRING COMMENT 'User name',
    gender TINYINT COMMENT 'Gender',
    age INT COMMENT 'Age',
    age_group STRING COMMENT 'Age group',
    province STRING COMMENT 'Province',
    city STRING COMMENT 'City',
    registration_date DATE COMMENT 'Registration date',
    user_level TINYINT COMMENT 'User level',
    start_date DATE COMMENT 'Effective start date',
    end_date DATE COMMENT 'Effective end date',
    is_current BOOLEAN COMMENT 'Is current version'
) COMMENT 'User dimension table';
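The start_date / end_date / is_current columns carry the SCD Type 2 history, but the DDL alone does not show the update step: when a user's attributes change, the current row is closed out and a new current row is opened. A hedged sketch of that daily refresh, assuming a hypothetical ods_user_change feed with one row per user changed on 2024-01-20 (Hive tables are rewritten rather than updated in place):

INSERT OVERWRITE TABLE dim_user
-- Keep all history; close out the current row of any user that changed today
SELECT
    d.user_key, d.user_id, d.user_name, d.gender, d.age, d.age_group,
    d.province, d.city, d.registration_date, d.user_level,
    d.start_date,
    CASE WHEN d.is_current = true AND c.user_id IS NOT NULL
         THEN CAST('2024-01-19' AS DATE) ELSE d.end_date END AS end_date,
    CASE WHEN d.is_current = true AND c.user_id IS NOT NULL
         THEN false ELSE d.is_current END AS is_current
FROM dim_user d
LEFT JOIN ods_user_change c
       ON d.user_id = c.user_id AND d.is_current = true
UNION ALL
-- Open a new current row per changed user; surrogate keys continue from the max
SELECT
    mk.max_key + ROW_NUMBER() OVER (ORDER BY c.user_id) AS user_key,
    c.user_id, c.user_name, c.gender, c.age, c.age_group,
    c.province, c.city, c.registration_date, c.user_level,
    CAST('2024-01-20' AS DATE) AS start_date,
    CAST('9999-12-31' AS DATE) AS end_date,
    true AS is_current
FROM ods_user_change c
CROSS JOIN (SELECT MAX(user_key) AS max_key FROM dim_user) mk;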
Fact Table Design
-- Consultation fact table
CREATE TABLE fact_consultation (
    consultation_id BIGINT COMMENT 'Consultation ID',
    date_key INT COMMENT 'Date surrogate key',
    user_key BIGINT COMMENT 'User surrogate key',
    doctor_key BIGINT COMMENT 'Doctor surrogate key',
    department_key INT COMMENT 'Department surrogate key',
    time_key INT COMMENT 'Time surrogate key',
    -- Measures
    consultation_duration INT COMMENT 'Consultation duration (minutes)',
    consultation_fee DECIMAL(10,2) COMMENT 'Consultation fee',
    prescription_count INT COMMENT 'Prescription count',
    drug_count INT COMMENT 'Drug count',
    total_drug_fee DECIMAL(10,2) COMMENT 'Total drug fee',
    user_rating TINYINT COMMENT 'User rating',
    -- Degenerate dimensions
    symptoms STRING COMMENT 'Symptom description',
    diagnosis_result STRING COMMENT 'Diagnosis result'
) COMMENT 'Consultation fact table'
PARTITIONED BY (dt STRING COMMENT 'Partition date')
STORED AS PARQUET;
-- User behavior fact table
CREATE TABLE fact_user_behavior (
    behavior_id BIGINT COMMENT 'Behavior ID',
    date_key INT COMMENT 'Date surrogate key',
    user_key BIGINT COMMENT 'User surrogate key',
    time_key INT COMMENT 'Time surrogate key',
    behavior_type_key INT COMMENT 'Behavior type surrogate key',
    -- Measures
    page_stay_duration INT COMMENT 'Page dwell time (seconds)',
    search_count INT COMMENT 'Search count',
    consultation_click_count INT COMMENT 'Consultation click count',
    -- Degenerate dimensions
    page_url STRING COMMENT 'Page URL',
    search_keyword STRING COMMENT 'Search keyword',
    doctor_id BIGINT COMMENT 'Doctor ID'
) COMMENT 'User behavior fact table'
PARTITIONED BY (dt STRING COMMENT 'Partition date')
STORED AS PARQUET;
ETL Data Processing Flow
Dimension Table ETL Script
-- Doctor dimension ETL
INSERT OVERWRITE TABLE dim_doctor
SELECT
ROW_NUMBER() OVER(ORDER BY doctor_id) + 1000000 as doctor_key,
doctor_id,
doctor_name,
department_id,
department_name,
title,
specialty,
work_years,
consultation_count,
avg_rating,
'2024-01-01' as start_date,
'9999-12-31' as end_date,
true as is_current
FROM (
SELECT
d.doctor_id,
d.doctor_name,
d.department_id,
dep.department_name,
d.title,
d.specialty,
d.work_years,
COUNT(DISTINCT c.consultation_id) as consultation_count,
AVG(COALESCE(c.user_rating, 0)) as avg_rating
FROM ods_doctor d
LEFT JOIN ods_department dep ON d.department_id = dep.department_id
LEFT JOIN ods_medical_consultation c ON d.doctor_id = c.doctor_id
WHERE d.dt = '2024-01-20'
GROUP BY
d.doctor_id, d.doctor_name, d.department_id, dep.department_name,
d.title, d.specialty, d.work_years
) t;
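Note that this script rebuilds dim_doctor from scratch with fixed effective dates, a common simplification for small dimensions; for true history tracking, the close-and-open SCD2 pattern sketched for dim_user above would be applied instead.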
Fact Table ETL Script
-- Consultation fact ETL
INSERT OVERWRITE TABLE fact_consultation PARTITION (dt='2024-01-20')
SELECT
    c.consultation_id,
    dd.date_key,
    du.user_key,
    ddoc.doctor_key,
    ddep.department_key,
    dt.time_key,
    -- Consultation duration in minutes (Hive has no TIMESTAMPDIFF)
    CASE
        WHEN c.start_time IS NOT NULL AND c.end_time IS NOT NULL
        THEN CAST((unix_timestamp(c.end_time) - unix_timestamp(c.start_time)) / 60 AS INT)
        ELSE 0
    END as consultation_duration,
    c.consultation_fee,
    -- Prescription-related measures
    COUNT(DISTINCT p.prescription_id) as prescription_count,
    SUM(pd.drug_quantity) as drug_count,
    SUM(pd.drug_price * pd.drug_quantity) as total_drug_fee,
    COALESCE(c.user_rating, 0) as user_rating,
    c.symptoms,
    c.diagnosis_result
FROM ods_medical_consultation c
LEFT JOIN dim_date dd ON to_date(c.start_time) = dd.actual_date
LEFT JOIN dim_user du ON c.user_id = du.user_id AND du.is_current = true
LEFT JOIN dim_doctor ddoc ON c.doctor_id = ddoc.doctor_id AND ddoc.is_current = true
LEFT JOIN dim_department ddep ON c.department_id = ddep.department_id
LEFT JOIN dim_time dt ON HOUR(c.start_time) = dt.hour
LEFT JOIN ods_prescription p ON c.prescription_id = p.prescription_id
LEFT JOIN ods_prescription_detail pd ON p.prescription_id = pd.prescription_id
WHERE c.dt = '2024-01-20'
    AND c.consultation_status = 2 -- completed consultations only
GROUP BY
    c.consultation_id, dd.date_key, du.user_key, ddoc.doctor_key,
    ddep.department_key, dt.time_key, c.consultation_fee, c.user_rating,
    c.symptoms, c.diagnosis_result, c.start_time, c.end_time;
Data Quality Monitoring
Data Quality Check Script
# Data quality monitoring script
from datetime import datetime

from pyspark.sql import SparkSession


def data_quality_check(spark, check_date):
    """Run completeness, consistency, and business-logic checks for one partition."""
    quality_metrics = {}

    # Completeness: null counts and rates on key foreign-key columns
    completeness_check = spark.sql(f"""
        SELECT
            'ods_medical_consultation' as table_name,
            COUNT(*) as total_count,
            SUM(CASE WHEN user_id IS NULL THEN 1 ELSE 0 END) as null_user_count,
            SUM(CASE WHEN doctor_id IS NULL THEN 1 ELSE 0 END) as null_doctor_count,
            SUM(CASE WHEN user_id IS NULL THEN 1 ELSE 0 END) / COUNT(*) as null_user_ratio
        FROM ods_medical_consultation
        WHERE dt = '{check_date}'
    """)

    # Consistency: share of records with a duplicated primary key
    consistency_check = spark.sql(f"""
        SELECT
            'consultation' as check_type,
            COUNT(DISTINCT consultation_id) as distinct_consultations,
            COUNT(*) as total_records,
            1 - COUNT(DISTINCT consultation_id) / COUNT(*) as duplication_ratio
        FROM ods_medical_consultation
        WHERE dt = '{check_date}'
    """)

    # Business logic: fees must be non-negative and within a plausible range
    business_logic_check = spark.sql(f"""
        SELECT
            'fee_check' as check_type,
            COUNT(*) as total_consultations,
            SUM(CASE WHEN consultation_fee < 0 THEN 1 ELSE 0 END) as negative_fee_count,
            SUM(CASE WHEN consultation_fee > 1000 THEN 1 ELSE 0 END) as abnormal_fee_count
        FROM ods_medical_consultation
        WHERE dt = '{check_date}'
    """)

    # Collect the metrics for reporting
    quality_metrics['completeness'] = completeness_check.collect()
    quality_metrics['consistency'] = consistency_check.collect()
    quality_metrics['business_logic'] = business_logic_check.collect()
    return quality_metrics


def generate_quality_report(metrics, check_date):
    """Render the collected metrics as a plain-text report."""
    report = f"""
    Data Quality Report
    Check date: {check_date}
    Generated at: {datetime.now()}

    1. Completeness:
       - Total records: {metrics['completeness'][0]['total_count']}
       - Null user_id ratio: {metrics['completeness'][0]['null_user_ratio']:.2%}

    2. Consistency:
       - Duplicate record ratio: {metrics['consistency'][0]['duplication_ratio']:.2%}

    3. Business logic:
       - Abnormal fee records: {metrics['business_logic'][0]['abnormal_fee_count']}
    """
    return report
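In a production schedule these checks would typically run right after the day's ETL completes, with threshold rules (for example, a non-zero negative_fee_count or a null_user_ratio above an agreed limit) blocking promotion of the partition to downstream layers.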
Data Application and Visualization
Business Metric Computation
-- Key business metric queries
-- 1. Daily consultation statistics
SELECT
dd.actual_date as stat_date,
COUNT(*) as consultation_count,
COUNT(DISTINCT user_key) as unique_users,
COUNT(DISTINCT doctor_key) as active_doctors,
AVG(consultation_duration) as avg_duration,
SUM(consultation_fee) as total_revenue
FROM fact_consultation fc
JOIN dim_date dd ON fc.date_key = dd.date_key
WHERE dd.actual_date >= DATE_SUB(CURRENT_DATE, 30)
GROUP BY dd.actual_date
ORDER BY stat_date DESC;
-- 2. Department consultation ranking
SELECT
department_name,
COUNT(*) as consultation_count,
AVG(user_rating) as avg_rating,
SUM(consultation_fee) as total_revenue,
COUNT(DISTINCT user_key) as served_users
FROM fact_consultation fc
JOIN dim_department dd ON fc.department_key = dd.department_key
WHERE fc.dt = '2024-01-20'
GROUP BY department_name
ORDER BY consultation_count DESC
LIMIT 10;
-- 3. User behavior analysis
SELECT
behavior_type,
COUNT(*) as behavior_count,
COUNT(DISTINCT user_key) as unique_users,
AVG(page_stay_duration) as avg_stay_duration
FROM fact_user_behavior
WHERE dt = '2024-01-20'
GROUP BY behavior_type
ORDER BY behavior_count DESC;
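For Superset dashboards and report exports via DataX, queries like these are usually materialized into an ADS-layer summary table rather than run against the fact table on every page load. A minimal sketch, assuming a hypothetical ads_consultation_daily table:

CREATE TABLE IF NOT EXISTS ads_consultation_daily (
    stat_date STRING COMMENT 'Statistics date',
    consultation_count BIGINT COMMENT 'Completed consultations',
    unique_users BIGINT COMMENT 'Distinct consulting users',
    total_revenue DECIMAL(16,2) COMMENT 'Consultation revenue'
) COMMENT 'Daily consultation summary for BI'
STORED AS PARQUET;

INSERT OVERWRITE TABLE ads_consultation_daily
SELECT
    CAST(dd.actual_date AS STRING) AS stat_date,
    COUNT(*)                       AS consultation_count,
    COUNT(DISTINCT fc.user_key)    AS unique_users,
    SUM(fc.consultation_fee)       AS total_revenue
FROM fact_consultation fc
JOIN dim_date dd ON fc.date_key = dd.date_key
GROUP BY dd.actual_date;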
Project Summary and Value
Technical Takeaways
- Architecture design: applying the Lambda architecture to a healthcare scenario
- Data modeling: fluent use of dimensional modeling methods
- ETL development: end-to-end data pipeline development experience
- Quality assurance: building a data quality management system
Business Value
- Operational analytics: multi-dimensional analysis of the consultation business
- User insight: understanding behavior patterns to improve the product experience
- Doctor evaluation: a framework for assessing doctors' service quality
- Decision support: data-driven evidence for management decisions
Through hands-on work on 尚硅谷's 《线上问诊之离线数仓》 project, learners gain a complete command of data warehouse construction methods and lay a solid data infrastructure foundation for the healthcare industry's digital transformation.