案例来源:黑马程序员2023版大数据Hadoop入门教程 P73~P77
实验环境:windows10系统、16G内存、三台虚拟机(2核4G、2核1.5G、2核1.5G,均已部署HDFS集群、Yarn集群、Hive架构)、DBeaver、FinaBI
实验数据:chat_data-5W.csv(18.8M)、chat_data-30W.csv(112.5M)、chat_data-1000W.csv(3.66G)3个文件
补充知识
基于查询结果建表
CREATE TABLE tbl_name AS SELECT ...;
数据准备
-- 创建数据库
create database db_msg ;
-- 切换数据库
use db_msg ;
-- 列举数据库
show databases ;
--如果表已存在就删除
drop table if exists db_msg.tb_msg_source ;
--建表
create table db_msg.tb_msg_source(
msg_time string comment "消息发送时间",
sender_name string comment "发送人昵称",
sender_account string comment "发送人账号",
sender_sex string comment "发送人性别",
sender_ip string comment "发送人ip地址",
sender_os string comment "发送人操作系统",
sender_phonetype string comment "发送人手机型号",
sender_network string comment "发送人网络类型",
sender_gps string comment "发送人的GPS定位",
receiver_name string comment "接收人昵称",
receiver_ip string comment "接收人IP",
receiver_account string comment "接收人账号",
receiver_os string comment "接收人操作系统",
receiver_phonetype string comment "接收人手机型号",
receiver_network string comment "接收人网络类型",
receiver_gps string comment "接收人的GPS定位",
receiver_sex string comment "接收人性别",
msg_type string comment "消息类型",
distance string comment "双方距离",
message string comment "消息内容"
);
-- 加载数据,5W条数据
LOAD DATA LOCAL INPATH '/home/hadoop/tmpdata/chat_data-5W.csv'
INTO TABLE DB_MSG.TB_MSG_SOURCE ;
-- 查询数据
SELECT *
FROM tb_msg_source ;
-- 查询数据
SELECT COUNT(1)
FROM tb_msg_source ;
-- 查询前 10 条记录
SELECT MSG_TIME ,SENDER_NAME ,SENDER_IP ,SENDER_PHONETYPE ,RECEIVER_NAME ,RECEIVER_NETWORK
FROM tb_msg_source
LIMIT 10;
ETL 数据清洗
- E,Extract,抽取
- T,Transform,转换
- L,Load,加载
-- 抽取、转换
-- 问题1:当前数据中,有一些数据的字段为空,不是合法数据
SELECT MSG_TIME ,SENDER_NAME ,SENDER_GPS
FROM TB_MSG_SOURCE
WHERE LENGTH(SENDER_GPS) == 0 ;
-- 问题2:通过时间字段构建 天、小时 字段
SELECT
MSG_TIME,
DAY (MSG_TIME),
HOUR (MSG_TIME)
FROM TB_MSG_SOURCE
limit 10;
-- 问题3:从 GPS 的经纬度提取 精度、纬度
SELECT
SENDER_GPS ,
split(SENDER_GPS,',')[0] AS sender_lng,
split(SENDER_GPS,',')[1] AS sender_lat
FROM TB_MSG_SOURCE
limit 10;
-- 加载
-- 新建表
drop table tb_msg_etl;
create table db_msg.tb_msg_etl(
msg_time string comment "消息发送时间",
sender_name string comment "发送人昵称",
sender_account string comment "发送人账号",
sender_sex string comment "发送人性别",
sender_ip string comment "发送人ip地址",
sender_os string comment "发送人操作系统",
sender_phonetype string comment "发送人手机型号",
sender_network string comment "发送人网络类型",
sender_gps string comment "发送人的GPS定位",
receiver_name string comment "接收人昵称",
receiver_ip string comment "接收人IP",
receiver_account string comment "接收人账号",
receiver_os string comment "接收人操作系统",
receiver_phonetype string comment "接收人手机型号",
receiver_network string comment "接收人网络类型",
receiver_gps string comment "接收人的GPS定位",
receiver_sex string comment "接收人性别",
msg_type string comment "消息类型",
distance string comment "双方距离",
message string comment "消息内容",
msg_day string comment "消息日",
msg_hour string comment "消息小时",
sender_lng double comment "经度",
sender_lat double comment "纬度"
);
-- 综合解决上述三个问题,并向新表中插入数据
INSERT INTO TABLE tb_msg_etl
SELECT
*,
DAY (MSG_TIME) AS msg_day,
HOUR (MSG_TIME) AS msg_hour,
split(SENDER_GPS,',')[0] AS sender_lng,
split(SENDER_GPS,',')[1] AS sender_lat
FROM TB_MSG_SOURCE
WHERE LENGTH(SENDER_GPS) > 0 ;
指标计算
-- 1、统计今日总消息量
CREATE TABLE IF NOT EXISTS tb_rs_total_msg_cnt
COMMENT "每日消费总量" AS
SELECT
MSG_DAY ,
COUNT(*) AS total_msg_cnt
FROM TB_MSG_ETL
GROUP BY MSG_DAY ;
-- 2、统计今日每小时消息量、发送和接收用户数
CREATE TABLE IF NOT EXISTS tb_rs_hour_msg_cnt
COMMENT "每小时消息量趋势" AS
SELECT
MSG_HOUR,
COUNT(*) AS total_msg_cnt,
COUNT(DISTINCT SENDER_ACCOUNT) AS sender_usr_cnt ,
COUNT(DISTINCT RECEIVER_ACCOUNT) AS receiver_usr_cnt
FROM TB_MSG_ETL
GROUP BY MSG_HOUR ;
-- 3、统计今日各地区发送消息数据量
CREATE TABLE IF NOT EXISTS tb_rs_loc_cnt
COMMENT "今日各地区发送消息总量" AS
SELECT
MSG_DAY ,
SENDER_LNG ,
SENDER_LAT ,
COUNT(*) AS total_msg_cnt
FROM TB_MSG_ETL
GROUP BY MSG_DAY ,SENDER_LNG ,SENDER_LAT ;
-- 4、统计今日发送消息和接收消息的用户数
CREATE TABLE IF NOT EXISTS tb_rs_usr_cnt
COMMENT "今日发送消息人数、接受消息人数" AS
SELECT
MSG_DAY,
COUNT(DISTINCT SENDER_ACCOUNT) AS sender_usr_cnt ,
COUNT(DISTINCT RECEIVER_ACCOUNT) AS receiver_usr_cnt
FROM TB_MSG_ETL
GROUP BY MSG_DAY ;
-- 5、统计今日发送消息最多的Top10用户
CREATE TABLE IF NOT EXISTS tb_rs_sender_usr_top10
COMMENT "发送消息条数最多的TOP10用户" AS
SELECT
SENDER_NAME AS username,
COUNT(*) AS sender_msg_cnt
FROM TB_MSG_ETL
GROUP BY SENDER_NAME
ORDER BY sender_msg_cnt DESC
LIMIT 10;
-- 6、统计今日接收消息最多的Top10用户
CREATE TABLE IF NOT EXISTS tb_rs_receiver_usr_top10
COMMENT "接收消息条数最多的TOP10用户" AS
SELECT
RECEIVER_NAME AS username,
COUNT(*) AS receiver_msg_cnt
FROM TB_MSG_ETL
GROUP BY RECEIVER_NAME
ORDER BY receiver_msg_cnt DESC
LIMIT 10;
-- 7、统计发送人的手机型号分布情况
CREATE TABLE IF NOT EXISTS tb_rs_sender_phone
COMMENT "发送人的手机型号分布" AS
SELECT
SENDER_PHONETYPE,
COUNT(*) AS cnt
FROM TB_MSG_ETL
GROUP BY SENDER_PHONETYPE ;
-- 8、统计发送人的设备操作系统分布情况
CREATE TABLE IF NOT EXISTS tb_rs_sender_os
COMMENT "发送人的操作系统分布" AS
SELECT
SENDER_PHONETYPE,
COUNT(*) AS cnt
FROM TB_MSG_ETL
GROUP BY SENDER_OS ;
可视化展示
FineBI 使用指标计算得到的 8张表,使用不同的组件,进行可视化展示
# 由于FinaBI的安装在本地,使用的数据也是本地的,所以做出的可展示报表也是本地的,外网无法访问
# 提供了结果图示以供参考