The raw JSON:
{
  "id": "iotd-62020dba-e4a5-4600-9289-0eab2263158e",
  "owner": "EjuxITmH",
  "properties": {
    "telemetry": {
      "1002_HW_AI010202_PV": {
        "ts": 1658387937620,
        "value": 28.345173
      },
      "1003_HW_AI010204_PV": {
        "ts": 1658387937620,
        "value": 28
      },
      "1004_HW_AI010205_PV": {
        "ts": 1658387937620,
        "value": 255
      }
    }
  },
  "subscribe_id": "iotd-62020dba-e4a5-46G0-9289-0eab2263158e2_mdmp-top1c"
}
Desired output, three rows by six columns (id, owner, subscribe_id, eid, ts, value):
[iotd-62020dba-e4a5-4600-9289-0eab2263158e, EjuxITmH, iotd-62020dba-e4a5-46G0-9289-0eab2263158e2_mdmp-top1c, 1003_HW_AI010204_PV, 1658387937620, 28.0]
[iotd-62020dba-e4a5-4600-9289-0eab2263158e, EjuxITmH, iotd-62020dba-e4a5-46G0-9289-0eab2263158e2_mdmp-top1c, 1004_HW_AI010205_PV, 1658387937620, 255.0]
[iotd-62020dba-e4a5-4600-9289-0eab2263158e, EjuxITmH, iotd-62020dba-e4a5-46G0-9289-0eab2263158e2_mdmp-top1c, 1002_HW_AI010202_PV, 1658387937620, 28.345173]
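Stated in plain Python first, the transformation is just a walk over properties.telemetry that emits one six-column row per map entry. A minimal sketch, with the sample document inlined as a dict:

payload = {
    "id": "iotd-62020dba-e4a5-4600-9289-0eab2263158e",
    "owner": "EjuxITmH",
    "properties": {"telemetry": {
        "1002_HW_AI010202_PV": {"ts": 1658387937620, "value": 28.345173},
        "1003_HW_AI010204_PV": {"ts": 1658387937620, "value": 28},
        "1004_HW_AI010205_PV": {"ts": 1658387937620, "value": 255},
    }},
    "subscribe_id": "iotd-62020dba-e4a5-46G0-9289-0eab2263158e2_mdmp-top1c",
}

# One flat row per telemetry entry (row order may differ from the listing above).
for eid, point in payload["properties"]["telemetry"].items():
    print([payload["id"], payload["owner"], payload["subscribe_id"],
           eid, point["ts"], float(point["value"])])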
- Load the data from Kafka; the source table schema has to line up with the JSON structure.
CREATE TABLE kafka_source (
id STRING,
owner STRING,
properties ROW<telemetry MAP<STRING,ROW<ts BIGINT,`value` DOUBLE>>>,
subscribe_id STRING
) WITH (
'connector' = 'kafka',
'topic' = 'json',
'properties.bootstrap.servers' = '192.168.100.30:9092',
'properties.group.id' = 'testGroup',
'format' = 'json',
'scan.startup.mode' = 'latest-offset',
'json.fail-on-missing-field' = 'false',
'json.ignore-parse-errors' = 'true'
);
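To exercise the source table during testing, publish the sample document to the json topic. A minimal producer sketch, assuming the kafka-python package is installed (any producer works):

import json
from kafka import KafkaProducer  # assumption: kafka-python is installed

producer = KafkaProducer(
    bootstrap_servers='192.168.100.30:9092',
    value_serializer=lambda v: json.dumps(v).encode('utf-8'))

# A one-point version of the sample document; extend with the other points as needed.
producer.send('json', {
    "id": "iotd-62020dba-e4a5-4600-9289-0eab2263158e",
    "owner": "EjuxITmH",
    "properties": {"telemetry": {
        "1002_HW_AI010202_PV": {"ts": 1658387937620, "value": 28.345173}}},
    "subscribe_id": "iotd-62020dba-e4a5-46G0-9289-0eab2263158e2_mdmp-top1c",
})
producer.flush()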
- Write a UDF over the loaded data; its job is to flatten the nested properties structure into an array of (eid, ts, value) rows.
from pyflink.table import DataTypes, Row
from pyflink.table.udf import udf

@udf(result_type=DataTypes.ARRAY(DataTypes.ROW(row_fields=[
        DataTypes.FIELD("eid", DataTypes.STRING()),
        DataTypes.FIELD("ts", DataTypes.BIGINT()),
        DataTypes.FIELD("value", DataTypes.DOUBLE())])))
def parse(r: Row):
    # 'properties' arrives as a Row holding one field, 'telemetry':
    # a MAP<STRING, ROW<ts, value>> keyed by point id.
    m = r['telemetry']
    rows = []
    for eid in m:
        rows.append(Row(eid=eid, ts=m[eid]['ts'], value=m[eid]['value']))
    return rows
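Because the body only uses key lookups, the logic can be sanity-checked outside Flink against a plain dict. A quick sketch (parse_plain is a hypothetical local copy for testing, not part of the job):

# Hypothetical copy of the UDF body; dicts stand in for Flink Rows.
def parse_plain(r):
    m = r['telemetry']
    return [{'eid': eid, 'ts': m[eid]['ts'], 'value': m[eid]['value']} for eid in m]

sample = {'telemetry': {'1002_HW_AI010202_PV': {'ts': 1658387937620, 'value': 28.345173}}}
print(parse_plain(sample))
# -> [{'eid': '1002_HW_AI010202_PV', 'ts': 1658387937620, 'value': 28.345173}]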
- Use the UDF to write the transformed data into a new table.
New table schema:
create table kafka_sink(
id STRING,
`owner` STRING,
telemetry ARRAY<ROW<eid STRING,ts BIGINT,`value` DOUBLE>>,
subscribe_id STRING
) WITH (
'connector' = 'kafka',
'topic' = 'flink',
'properties.bootstrap.servers' = '192.168.100.30:9092',
'properties.group.id' = 'testGroup',
'format' = 'json',
'scan.startup.mode' = 'latest-offset',
'json.fail-on-missing-field' = 'false',
'json.ignore-parse-errors' = 'true'
);
Invoke the UDF and write into the new table:
insert into kafka_sink select id,`owner`,parse(properties) as telemetry,subscribe_id from kafka_source;
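For reference, each record written to the flink topic should serialize roughly as below. This is a sketch of the json format's output for the sink schema, not captured output; the array order follows the UDF's map iteration order and may vary:

{
  "id": "iotd-62020dba-e4a5-4600-9289-0eab2263158e",
  "owner": "EjuxITmH",
  "telemetry": [
    {"eid": "1002_HW_AI010202_PV", "ts": 1658387937620, "value": 28.345173},
    {"eid": "1003_HW_AI010204_PV", "ts": 1658387937620, "value": 28.0},
    {"eid": "1004_HW_AI010205_PV", "ts": 1658387937620, "value": 255.0}
  ],
  "subscribe_id": "iotd-62020dba-e4a5-46G0-9289-0eab2263158e2_mdmp-top1c"
}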
- Create the print sink table and write the final output.
create table printSink(
id STRING,
`owner` STRING,
subscribe_id STRING,
eid STRING,
ts BIGINT,
`value` DOUBLE
) WITH (
'connector' = 'print'
);
Write the result. The comma followed by UNNEST(telemetry) is an implicit cross join: each element of the telemetry array becomes its own output row:
insert into printSink select id,`owner`,subscribe_id,eid,ts,`value` from kafka_sink, UNNEST(telemetry) AS t (eid,ts,`value`);
The complete code follows.
SQL:
-- 'v4.parse' resolves to the parse function in module v4, i.e. the v4.py file shown below.
CREATE TEMPORARY FUNCTION parse AS 'v4.parse' LANGUAGE PYTHON;
SET 'sql-client.execution.result-mode' = 'tableau';
CREATE TABLE kafka_source (
id STRING,
owner STRING,
properties ROW<telemetry MAP<STRING,ROW<ts BIGINT,`value` DOUBLE>>>,
subscribe_id STRING
) WITH (
'connector' = 'kafka',
'topic' = 'json',
'properties.bootstrap.servers' = '192.168.100.30:9092',
'properties.group.id' = 'testGroup',
'format' = 'json',
'scan.startup.mode' = 'latest-offset',
'json.fail-on-missing-field' = 'false',
'json.ignore-parse-errors' = 'true'
);
create table kafka_sink(
id STRING,
`owner` STRING,
telemetry ARRAY<ROW<eid STRING,ts BIGINT,`value` DOUBLE>>,
subscribe_id STRING
) WITH (
'connector' = 'kafka',
'topic' = 'flink',
'properties.bootstrap.servers' = '192.168.100.30:9092',
'properties.group.id' = 'testGroup',
'format' = 'json',
'scan.startup.mode' = 'latest-offset',
'json.fail-on-missing-field' = 'false',
'json.ignore-parse-errors' = 'true'
);
create table printSink(
id STRING,
`owner` STRING,
subscribe_id STRING,
eid STRING,
ts BIGINT,
`value` DOUBLE
) WITH (
'connector' = 'print'
);
insert into kafka_sink select id,`owner`,parse(properties) as telemetry,subscribe_id from kafka_source;
insert into printSink select id,`owner`,subscribe_id,eid,ts,`value` from kafka_sink, UNNEST(telemetry) AS t (eid,ts,`value`);
Python UDF (v4.py):
from pyflink.table import DataTypes, Row
from pyflink.table.udf import udf

@udf(result_type=DataTypes.ARRAY(DataTypes.ROW(row_fields=[
        DataTypes.FIELD("eid", DataTypes.STRING()),
        DataTypes.FIELD("ts", DataTypes.BIGINT()),
        DataTypes.FIELD("value", DataTypes.DOUBLE())])))
def parse(r: Row):
    # 'properties' arrives as a Row holding one field, 'telemetry':
    # a MAP<STRING, ROW<ts, value>> keyed by point id.
    m = r['telemetry']
    rows = []
    for eid in m:
        rows.append(Row(eid=eid, ts=m[eid]['ts'], value=m[eid]['value']))
    return rows
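The same pipeline can also be driven from a PyFlink script instead of the SQL client. A sketch under a few assumptions: the UDF file is named v4.py and is importable, and the connector jar path matches your installation:

from pyflink.table import EnvironmentSettings, TableEnvironment
from v4 import parse  # the UDF module above

t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
# Jar path is an assumption; point it at your flink-sql-connector-kafka jar.
t_env.get_config().get_configuration().set_string(
    "pipeline.jars",
    "file:///opt/flink/lib/flink-sql-connector-kafka-1.15.1.jar")
t_env.create_temporary_function("parse", parse)

t_env.execute_sql("""
    CREATE TABLE kafka_source (
      id STRING,
      owner STRING,
      properties ROW<telemetry MAP<STRING, ROW<ts BIGINT, `value` DOUBLE>>>,
      subscribe_id STRING
    ) WITH (
      'connector' = 'kafka',
      'topic' = 'json',
      'properties.bootstrap.servers' = '192.168.100.30:9092',
      'properties.group.id' = 'testGroup',
      'format' = 'json',
      'scan.startup.mode' = 'latest-offset',
      'json.fail-on-missing-field' = 'false',
      'json.ignore-parse-errors' = 'true'
    )""")

t_env.execute_sql("""
    CREATE TABLE kafka_sink (
      id STRING,
      `owner` STRING,
      telemetry ARRAY<ROW<eid STRING, ts BIGINT, `value` DOUBLE>>,
      subscribe_id STRING
    ) WITH (
      'connector' = 'kafka',
      'topic' = 'flink',
      'properties.bootstrap.servers' = '192.168.100.30:9092',
      'properties.group.id' = 'testGroup',
      'format' = 'json',
      'scan.startup.mode' = 'latest-offset',
      'json.fail-on-missing-field' = 'false',
      'json.ignore-parse-errors' = 'true'
    )""")

t_env.execute_sql("""
    CREATE TABLE printSink (
      id STRING,
      `owner` STRING,
      subscribe_id STRING,
      eid STRING,
      ts BIGINT,
      `value` DOUBLE
    ) WITH ('connector' = 'print')""")

# Bundle both INSERTs into one statement set so they run as a single job;
# two separate execute_sql() INSERT calls would submit two independent jobs.
stmt_set = t_env.create_statement_set()
stmt_set.add_insert_sql(
    "INSERT INTO kafka_sink "
    "SELECT id, `owner`, parse(properties) AS telemetry, subscribe_id "
    "FROM kafka_source")
stmt_set.add_insert_sql(
    "INSERT INTO printSink "
    "SELECT id, `owner`, subscribe_id, eid, ts, `value` "
    "FROM kafka_sink, UNNEST(telemetry) AS t (eid, ts, `value`)")
stmt_set.execute().wait()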