可视化展示案例(含ETL、FineBI使用)

159 阅读5分钟

案例来源:黑马程序员2023版大数据Hadoop入门教程 P73~P77

实验环境:windows10系统、16G内存、三台虚拟机(2核4G、2核1.5G、2核1.5G,均已部署HDFS集群、Yarn集群、Hive架构)、DBeaver、FinaBI

实验数据:chat_data-5W.csv(18.8M)、chat_data-30W.csv(112.5M)、chat_data-1000W.csv(3.66G)3个文件

补充知识

基于查询结果建表

 CREATE TABLE tbl_name AS SELECT ...;

数据准备

 -- 创建数据库
 create database db_msg ;
 ​
 -- 切换数据库
 use db_msg ;
 ​
 -- 列举数据库
 show databases ;
 ​
 --如果表已存在就删除
 drop table if exists db_msg.tb_msg_source ;
 ​
 --建表
 create table db_msg.tb_msg_source(
 msg_time string comment "消息发送时间",
 sender_name string comment "发送人昵称",
 sender_account string comment "发送人账号",
 sender_sex string comment "发送人性别",
 sender_ip string comment "发送人ip地址",
 sender_os string comment "发送人操作系统",
 sender_phonetype string comment "发送人手机型号",
 sender_network string comment "发送人网络类型",
 sender_gps string comment "发送人的GPS定位",
 receiver_name string comment "接收人昵称",
 receiver_ip string comment "接收人IP",
 receiver_account string comment "接收人账号",
 receiver_os string comment "接收人操作系统",
 receiver_phonetype string comment "接收人手机型号",
 receiver_network string comment "接收人网络类型",
 receiver_gps string comment "接收人的GPS定位",
 receiver_sex string comment "接收人性别",
 msg_type string comment "消息类型",
 distance string comment "双方距离",
 message string comment "消息内容"
 );
 ​
 -- 加载数据,5W条数据
 LOAD DATA LOCAL INPATH '/home/hadoop/tmpdata/chat_data-5W.csv' 
 INTO TABLE DB_MSG.TB_MSG_SOURCE ;
 ​
 -- 查询数据
 SELECT * 
 FROM tb_msg_source ;
 ​
 -- 查询数据
 SELECT COUNT(1) 
 FROM tb_msg_source ;
 ​
 -- 查询前 10 条记录
 SELECT MSG_TIME ,SENDER_NAME ,SENDER_IP ,SENDER_PHONETYPE ,RECEIVER_NAME ,RECEIVER_NETWORK 
 FROM tb_msg_source 
 LIMIT 10;

ETL 数据清洗

  • E,Extract,抽取
  • T,Transform,转换
  • L,Load,加载
 -- 抽取、转换
 -- 问题1:当前数据中,有一些数据的字段为空,不是合法数据
 SELECT MSG_TIME ,SENDER_NAME ,SENDER_GPS 
 FROM TB_MSG_SOURCE
 WHERE LENGTH(SENDER_GPS) == 0 ;
 ​
 -- 问题2:通过时间字段构建 天、小时 字段
 SELECT 
     MSG_TIME,
     DAY (MSG_TIME),
     HOUR (MSG_TIME)
 FROM TB_MSG_SOURCE
 limit 10;
 ​
 -- 问题3:从 GPS 的经纬度提取 精度、纬度
 SELECT 
     SENDER_GPS ,
     split(SENDER_GPS,',')[0] AS sender_lng,
     split(SENDER_GPS,',')[1] AS sender_lat
 FROM TB_MSG_SOURCE
 limit 10;
 -- 加载
 -- 新建表
 drop table tb_msg_etl;
 create table db_msg.tb_msg_etl(
 msg_time string comment "消息发送时间",
 sender_name string comment "发送人昵称",
 sender_account string comment "发送人账号",
 sender_sex string comment "发送人性别",
 sender_ip string comment "发送人ip地址",
 sender_os string comment "发送人操作系统",
 sender_phonetype string comment "发送人手机型号",
 sender_network string comment "发送人网络类型",
 sender_gps string comment "发送人的GPS定位",
 receiver_name string comment "接收人昵称",
 receiver_ip string comment "接收人IP",
 receiver_account string comment "接收人账号",
 receiver_os string comment "接收人操作系统",
 receiver_phonetype string comment "接收人手机型号",
 receiver_network string comment "接收人网络类型",
 receiver_gps string comment "接收人的GPS定位",
 receiver_sex string comment "接收人性别",
 msg_type string comment "消息类型",
 distance string comment "双方距离",
 message string comment "消息内容",
 msg_day string comment "消息日",
 msg_hour string comment "消息小时",
 sender_lng double comment "经度",
 sender_lat double comment "纬度"
 );
 ​
 -- 综合解决上述三个问题,并向新表中插入数据
 INSERT INTO TABLE tb_msg_etl
 SELECT 
     *,
     DAY (MSG_TIME) AS msg_day,
     HOUR (MSG_TIME) AS msg_hour,
     split(SENDER_GPS,',')[0] AS sender_lng,
     split(SENDER_GPS,',')[1] AS sender_lat
 FROM TB_MSG_SOURCE
 WHERE LENGTH(SENDER_GPS) > 0 ;

指标计算

 -- 1、统计今日总消息量
 CREATE TABLE IF NOT EXISTS tb_rs_total_msg_cnt
 COMMENT "每日消费总量" AS
 SELECT 
     MSG_DAY ,
     COUNT(*) AS total_msg_cnt 
 FROM TB_MSG_ETL
 GROUP BY MSG_DAY ;
 ​
 -- 2、统计今日每小时消息量、发送和接收用户数
 CREATE TABLE IF NOT EXISTS tb_rs_hour_msg_cnt
 COMMENT "每小时消息量趋势" AS
 SELECT 
     MSG_HOUR,
     COUNT(*) AS total_msg_cnt,
     COUNT(DISTINCT SENDER_ACCOUNT) AS sender_usr_cnt ,
     COUNT(DISTINCT RECEIVER_ACCOUNT) AS receiver_usr_cnt
 FROM TB_MSG_ETL
 GROUP BY MSG_HOUR ;
 ​
 -- 3、统计今日各地区发送消息数据量
 CREATE TABLE IF NOT EXISTS tb_rs_loc_cnt
 COMMENT "今日各地区发送消息总量" AS
 SELECT 
     MSG_DAY ,
     SENDER_LNG ,
     SENDER_LAT ,
     COUNT(*) AS total_msg_cnt
 FROM TB_MSG_ETL
 GROUP BY MSG_DAY ,SENDER_LNG ,SENDER_LAT ;
 ​
 -- 4、统计今日发送消息和接收消息的用户数
 CREATE TABLE IF NOT EXISTS tb_rs_usr_cnt
 COMMENT "今日发送消息人数、接受消息人数" AS
 SELECT 
     MSG_DAY,
     COUNT(DISTINCT SENDER_ACCOUNT) AS sender_usr_cnt ,
     COUNT(DISTINCT RECEIVER_ACCOUNT) AS receiver_usr_cnt 
 FROM TB_MSG_ETL
 GROUP BY MSG_DAY ;
 ​
 -- 5、统计今日发送消息最多的Top10用户
 CREATE TABLE IF NOT EXISTS tb_rs_sender_usr_top10
 COMMENT "发送消息条数最多的TOP10用户" AS
 SELECT 
     SENDER_NAME AS username,
     COUNT(*) AS sender_msg_cnt 
 FROM TB_MSG_ETL
 GROUP BY SENDER_NAME 
 ORDER BY sender_msg_cnt DESC 
 LIMIT 10;
 ​
 -- 6、统计今日接收消息最多的Top10用户
 CREATE TABLE IF NOT EXISTS tb_rs_receiver_usr_top10
 COMMENT "接收消息条数最多的TOP10用户" AS
 SELECT 
     RECEIVER_NAME AS username,
     COUNT(*) AS receiver_msg_cnt 
 FROM TB_MSG_ETL
 GROUP BY RECEIVER_NAME 
 ORDER BY receiver_msg_cnt DESC 
 LIMIT 10;
 ​
 -- 7、统计发送人的手机型号分布情况
 CREATE TABLE IF NOT EXISTS tb_rs_sender_phone
 COMMENT "发送人的手机型号分布" AS
 SELECT 
     SENDER_PHONETYPE,
     COUNT(*) AS cnt
 FROM TB_MSG_ETL
 GROUP BY SENDER_PHONETYPE ;
 ​
 -- 8、统计发送人的设备操作系统分布情况
 CREATE TABLE IF NOT EXISTS tb_rs_sender_os
 COMMENT "发送人的操作系统分布" AS
 SELECT 
     SENDER_PHONETYPE,
     COUNT(*) AS cnt
 FROM TB_MSG_ETL
 GROUP BY SENDER_OS ;

可视化展示

FineBI 使用指标计算得到的 8张表,使用不同的组件,进行可视化展示

 # 由于FinaBI的安装在本地,使用的数据也是本地的,所以做出的可展示报表也是本地的,外网无法访问
 # 提供了结果图示以供参考

仪表板_page-0001.jpg