Flink: Parsing JSON and Converting It into Rows and Columns


The original JSON is as follows:

{
    "id": "iotd-62020dba-e4a5-4600-9289-0eab2263158e",
    "owner":"EjuxITmH",
    "properties": {
        "telemetry": {
            "1002_HW_AI010202_PV": {
                "ts": 1658387937620,
                "value": 28.345173
            },
            "1003_HW_AI010204_PV": {
                "ts": 1658387937620,
                "value": 28
            },
            "1004_HW_AI010205_PV": {
                "ts": 1658387937620,
                "value": 255
            }
        }
    },
    "subscribe_id":"iotd-62020dba-e4a5-46G0-9289-0eab2263158e2_mdmp-top1c"
}

Desired result: three rows, six columns:

[iotd-62020dba-e4a5-4600-9289-0eab2263158e, EjuxITmH, iotd-62020dba-e4a5-46G0-9289-0eab2263158e2_mdmp-top1c, 1003_HW_AI010204_PV, 1658387937620, 28.0]
[iotd-62020dba-e4a5-4600-9289-0eab2263158e, EjuxITmH, iotd-62020dba-e4a5-46G0-9289-0eab2263158e2_mdmp-top1c, 1004_HW_AI010205_PV, 1658387937620, 255.0]
[iotd-62020dba-e4a5-4600-9289-0eab2263158e, EjuxITmH, iotd-62020dba-e4a5-46G0-9289-0eab2263158e2_mdmp-top1c, 1002_HW_AI010202_PV, 1658387937620, 28.345173]
  1. Load the data from Kafka; the SQL schema has to line up with the JSON structure.
CREATE TABLE kafka_source (
  id STRING,
  owner STRING,
  properties ROW<telemetry MAP<STRING,ROW<ts BIGINT,`value` DOUBLE>>>,
  subscribe_id STRING
) WITH (
 'connector' = 'kafka',
 'topic' = 'json',
 'properties.bootstrap.servers' = '192.168.100.30:9092',
 'properties.group.id' = 'testGroup',
 'format' = 'json',
 'scan.startup.mode' = 'latest-offset',
 'json.fail-on-missing-field' = 'false',
 'json.ignore-parse-errors' = 'true' 
);
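
Before moving on to the UDF, it is worth confirming that the nested schema actually deserializes. A minimal check (assuming the topic already contains data) is to select the nested field directly in the SQL client:

select id, `owner`, properties.telemetry as telemetry, subscribe_id from kafka_source;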
  2. Write a UDF to process the loaded data; its main job is to reshape the properties structure into an array of rows.
# Flatten the nested telemetry MAP into an ARRAY of (eid, ts, value) rows.
@udf(result_type=DataTypes.ARRAY(DataTypes.ROW(row_fields=[
        DataTypes.FIELD("eid", DataTypes.STRING()),
        DataTypes.FIELD("ts", DataTypes.BIGINT()),
        DataTypes.FIELD("value", DataTypes.DOUBLE())])))
def parse(r: Row):
    # r is the properties ROW; its telemetry field is a MAP<STRING, ROW<ts, value>>
    m = r['telemetry']
    rows = []
    for eid in m:
        rows.append(Row(eid=eid, ts=m[eid]['ts'], value=m[eid]['value']))
    return rows
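
A quick way to sanity-check the transformation locally, without a cluster, is to run the same logic on a hand-built Row. The parse_plain helper below is a hypothetical copy of the UDF body (the decorated parse is meant to be invoked from SQL, not called directly), and the sample values are taken from the JSON above:

from pyflink.table import Row

def parse_plain(r):
    # same logic as parse(), minus the @udf decorator
    m = r['telemetry']
    return [Row(eid=eid, ts=m[eid]['ts'], value=m[eid]['value']) for eid in m]

sample = Row(telemetry={
    "1002_HW_AI010202_PV": Row(ts=1658387937620, value=28.345173),
    "1003_HW_AI010204_PV": Row(ts=1658387937620, value=28.0),
})
print(parse_plain(sample))
# expected: [Row(eid='1002_HW_AI010202_PV', ts=1658387937620, value=28.345173),
#            Row(eid='1003_HW_AI010204_PV', ts=1658387937620, value=28.0)]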
  3. Use the UDF to write the data into a new table.

New table schema:

create table kafka_sink(
  id STRING,
  `owner` STRING,
  telemetry ARRAY<ROW<eid STRING,ts BIGINT,`value` DOUBLE>>,
  subscribe_id STRING
)WITH (
 'connector' = 'kafka',
 'topic' = 'flink',
 'properties.bootstrap.servers' = '192.168.100.30:9092',
 'properties.group.id' = 'testGroup',
 'format' = 'json',
 'scan.startup.mode' = 'latest-offset',
 'json.fail-on-missing-field' = 'false',
 'json.ignore-parse-errors' = 'true' 
);

Call the UDF and write into the new table:

insert into kafka_sink select id,`owner`,parse(properties) as telemetry,subscribe_id from kafka_source;
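
With the sample message above, each record written to the flink topic should look roughly like this (field and element order may vary):

{
  "id": "iotd-62020dba-e4a5-4600-9289-0eab2263158e",
  "owner": "EjuxITmH",
  "telemetry": [
    {"eid": "1002_HW_AI010202_PV", "ts": 1658387937620, "value": 28.345173},
    {"eid": "1003_HW_AI010204_PV", "ts": 1658387937620, "value": 28.0},
    {"eid": "1004_HW_AI010205_PV", "ts": 1658387937620, "value": 255.0}
  ],
  "subscribe_id": "iotd-62020dba-e4a5-46G0-9289-0eab2263158e2_mdmp-top1c"
}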
  4. Create a sink table and print the output.
create table printSink(
  id STRING,
  `owner` STRING,
  subscribe_id STRING,
  eid STRING,
  ts BIGINT,
  `value` DOUBLE
)WITH (
 'connector' = 'print'
);

Write out the result:

insert into printSink select id,`owner`,subscribe_id,eid,ts,`value`  from kafka_sink, UNNEST(telemetry) AS t (eid,ts,`value`);
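
The intermediate kafka_sink topic makes the transformed data easy to inspect, but it is not strictly required. As an untested sketch, the UDF call and the UNNEST can also be combined into a single query that reads straight from kafka_source:

insert into printSink
select id, `owner`, subscribe_id, eid, ts, `value`
from (
  select id, `owner`, subscribe_id, parse(properties) as telemetry from kafka_source
) s, UNNEST(s.telemetry) AS t (eid, ts, `value`);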

Complete code

SQL:

CREATE TEMPORARY FUNCTION parse AS 'v4.parse' LANGUAGE PYTHON;
SET sql-client.execution.result-mode=TABLEAU;

CREATE TABLE kafka_source (
  id STRING,
  owner STRING,
  properties ROW<telemetry MAP<STRING,ROW<ts BIGINT,`value` DOUBLE>>>,
  subscribe_id STRING
) WITH (
 'connector' = 'kafka',
 'topic' = 'json',
 'properties.bootstrap.servers' = '192.168.100.30:9092',
 'properties.group.id' = 'testGroup',
 'format' = 'json',
 'scan.startup.mode' = 'latest-offset',
 'json.fail-on-missing-field' = 'false',
 'json.ignore-parse-errors' = 'true' 
);




create table kafka_sink(
  id STRING,
  `owner` STRING,
  telemetry ARRAY<ROW<eid STRING,ts BIGINT,`value` DOUBLE>>,
  subscribe_id STRING
)WITH (
 'connector' = 'kafka',
 'topic' = 'flink',
 'properties.bootstrap.servers' = '192.168.100.30:9092',
 'properties.group.id' = 'testGroup',
 'format' = 'json',
 'scan.startup.mode' = 'latest-offset',
 'json.fail-on-missing-field' = 'false',
 'json.ignore-parse-errors' = 'true' 
);

create table printSink(
  id STRING,
  `owner` STRING,
  subscribe_id STRING,
  eid STRING,
  ts BIGINT,
  `value` DOUBLE
)WITH (
 'connector' = 'print'
);

insert into kafka_sink select id,`owner`,parse(properties) as telemetry,subscribe_id from kafka_source;

insert into printSink select id,`owner`,subscribe_id,eid,ts,`value`  from kafka_sink, UNNEST(telemetry) AS t (eid,ts,`value`);
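
One practical detail: for CREATE TEMPORARY FUNCTION parse AS 'v4.parse' LANGUAGE PYTHON to resolve, the SQL client has to be able to import the module that defines the UDF. Assuming the Python code below is saved as v4.py, one way (a sketch; the path is a placeholder) is to register the file before creating the function:

-- hypothetical path; point it at the file that defines parse()
SET python.files=/path/to/v4.py;

The -pyfs start-up option of sql-client.sh should serve the same purpose.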

Python UDF:

# import logging
from pyflink.table import Row
from pyflink.table.udf import udf
from pyflink.table import DataTypes


# Flatten the nested telemetry MAP into an ARRAY of (eid, ts, value) rows.
@udf(result_type=DataTypes.ARRAY(DataTypes.ROW(row_fields=[
        DataTypes.FIELD("eid", DataTypes.STRING()),
        DataTypes.FIELD("ts", DataTypes.BIGINT()),
        DataTypes.FIELD("value", DataTypes.DOUBLE())])))
def parse(r: Row):
    # r is the properties ROW; its telemetry field is a MAP<STRING, ROW<ts, value>>
    m = r['telemetry']
    rows = []
    for eid in m:
        rows.append(Row(eid=eid, ts=m[eid]['ts'], value=m[eid]['value']))
    return rows