Data exchange (xxx2xxx): how it works


1 Underlying engines

Big data processing engines that can be used to implement data exchange include Sqoop, Flink, and DataX (open-sourced by Alibaba).

2 Data exchange pipelines

2.1 Mysql2hive

Take mysql2hive as an example: this component is implemented with the Sqoop tool.

Sqoop architecture diagram (figure omitted)

// The sqoop import (data extraction) command
2022-05-16 10:09:58,423] {xxx_to_s3_job.py:231} INFO - sqoop import -Dorg.apache.sqoop.splitter.allow_text_splitter=true -Dmapreduce.job.user.classpath.first=true -Dmapreduce.job.name=SQOOP_IMPORT_RUGAL_PLATFORM_HOURLY_ods_mysql_to_hive_all_type_case_hourly -Dmapreduce.map.maxattempts=1 -Dmapred.map.max.attempts=1 -Dmapreduce.map.speculative=false -Dmapred.map.tasks.speculative.execution=false -Dmapreduce.map.memory.mb=2048 -Dmapreduce.map.java.opts=-Xmx1536m --connect jdbc:mysql://10.0.25.26:33071/mysql_to_hive --username mysql_to_hive_rw --password 'xxxx' --split-by id --num-mappers 5 --query "select * from all_type_case where id > 1 and \$CONDITIONS;" --delete-target-dir --as-avrodatafile --compression-codec deflate --target-dir /external_export/data/ods/mysql/10.0.25.26/33071/mysql_to_hive/xxxx/20220516/hh=09/

The Sqoop extraction job writes the data to an HDFS path; DistCp then copies it to S3, and Hive loads the data directly from the S3 location (a sketch of that last step follows the DistCp command below).

graph TD
Mysql[(Database)] --> |sqoop import|Hdfs --> |distcp| S3
hive[(Database)] -->|load| S3

The corresponding DistCp command:

10:10:25,405] {xxx_to_s3_job.py:305} INFO - hadoop distcp -Dmapreduce.job.name=distcp_RUGAL_PLATFORM_HOURLY_mysql_to_hive_all_type_case_hourly -Dmapreduce.task.timeout=60000 -Dfs.s3a.endpoint=s3.cn-north-1.amazonaws.com.cn -Dfs.s3a.attempts.maximum=10 -Dfs.s3a.connection.maximum=20 -Dfs.s3a.threads.keepalivetime=60 -Dmapreduce.job.queuename=adhoc -Dyarn.resourcemanager.am.max-attempts=4 -Dmapreduce.am.max-attempts=4 -strategy=dynamic -m=20 /external_export/data/ods/mysql/10.0.25.26/33071/mysql_to_hive/xxx/20220516/hh=09/ s3a://xxx/ods/mysql/10.0.25.26/mysql_to_hive/ods_mysql_to_hive_all_type_case_hourly/dtm=20220516/hh=09/
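The final load into Hive is not shown in the logs above. Below is a minimal sketch of how the hourly partition could be registered, assuming an external Avro table named ods_mysql_to_hive_all_type_case_hourly partitioned by dtm/hh (the table name is inferred from the job name and is illustrative; the location is the DistCp target above):

-- Hypothetical Hive DDL: register the hour partition that DistCp just copied to S3.
ALTER TABLE ods_mysql_to_hive_all_type_case_hourly
  ADD IF NOT EXISTS PARTITION (dtm='20220516', hh='09')
  LOCATION 's3a://xxx/ods/mysql/10.0.25.26/mysql_to_hive/ods_mysql_to_hive_all_type_case_hourly/dtm=20220516/hh=09/';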

2.2 Mongo2hive

User flow diagram (figure omitted)

Data flow diagram (figure omitted)

The request payload sent to the Flink engine:

{
  'baichuan_rocketmqtocos_path': 'cosn://rugal-1251524319/baichuan_mongo_oid/sns_athena/athena.chatroom_process_record_hourly/dtm=20220516/hh=11',
  'rocketmq_consumer': 'data-wp-consumer',
  'rocketmq_topic_name': u'db-athena_chatroom_process_record-sns_athena-1628939334000',
  'today_timestamp': u'1652673660000',
  'mongo_collection_name_full': u'athena.chatroom_process_record',
  'job_name': 'mq2oid-sns_athena-athena-chatroom_process_record-20220516-11',
  'yesterday_timestamp': u'1652670000000',
  'rocketmq_topic_namesrv': u'red-rocketmq-namesrv.int.xxx.com:9876'
}
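These payload fields map almost one-to-one onto options of the Flink SQL below; the correspondence for this particular run (shown as SQL comments, option keys taken from the DDL that follows) is roughly:

-- yesterday_timestamp ('1652670000000')  -> 'connector.offset.start'
-- today_timestamp ('1652673660000')      -> 'connector.offset.end'
-- rocketmq_topic_name                    -> 'connector.topic'
-- rocketmq_consumer                      -> 'connector.group'
-- rocketmq_topic_namesrv                 -> 'connector.nameserver.address'
-- baichuan_rocketmqtocos_path            -> 'path' of the sink table
-- job_name                               -> `job_name` set at the top of the script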

Flink SQL for the job that pulls the MQ (oplog) data into COS:

set `job_name` = 'mq2oid-sns_athena-athena-chatroom_process_record-20220516-11';
set `disableChaining` = true;
create table sourceOplog (
  ns VARCHAR,
  `timestamp` BIGINT,
  op VARCHAR,
  doc VARCHAR,
  fromMigrate BOOLEAN,
  proc_time as PROCTIME()
) with (
  'connector.type' = 'rocketMQ',
  'connector.nameserver.address' = 'xxx-rocketmq-namesrv.int.xiaohongshu.com:9876',
  'connector.topic' = 'db-athena_chatroom_process_record-sns_athena-1628939334000',
  'connector.group' = 'data-wp-consumer',
  'connector.rocketMQ.async' = 'true',
  'connector.offset.property' = 'oplog_ts',  -- born, store, property
  'connector.offset.start' = '1652670000000',
  'connector.offset.end' = '1652673660000',
  -- how the oplog payload is parsed
  'format.type' = 'protobuf',
  'format.protobuf.business' = 'protobuf',
  'format.protobuf.message.name' = 'Oplog',
  'format.protobuf.skip.bytes' = '4',
  'format.property-version' = '1',
  'format.protobuf.external.message' = 'true',
  'format.protobuf.name' = 'oplog.proto'
  -- 'format.type' = 'raw'
);
create table sinkOplog (
  oid STRING
) with(
  'connector' = 'filesystem',
  'format' = 'parquet',
  'path' = 'cosn://rugal-1251524319/baichuan_mongo_oid/sns_athena/athena.chatroom_process_record_hourly/dtm=20220516/hh=11',
  'sink.file-suffix' = '.snappy.parquet',
  'parquet.compression' = 'snappy'
);
 
insert into sinkOplog
select red_json_value(doc, '$._id.\\$oid') as oid
from sourceOplog where ns = 'athena.chatroom_process_record'
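For context, the `doc` column carries the oplog document rendered as MongoDB extended JSON; the sketch below (the oid value is made up for illustration) shows what the JSON-path extraction in the INSERT above pulls out:

-- Hypothetical `doc` payload (MongoDB extended JSON); the oid is invented:
--   {"_id": {"$oid": "6281f3a2e4b0c8a1d2e3f4a5"}, ...}
-- The JSON-path expression above extracts the bare object id string:
--   '6281f3a2e4b0c8a1d2e3f4a5'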

Flink SQL template for the full (snapshot) load (the ${...} placeholders are filled in per table/job):

set `job_name` = '${job_name}';
set `disableChaining` = true;
CREATE TABLE sourceMongo (doc VARCHAR) with (
    'connector.type' = 'db_emitter',
    'connector.property-version' = '1',
    'connector.db_type' = 'mongo',
    'connector.db_uri' = '${db_uri}',
    'connector.user' = '',
    'connector.passwd' = '',
    'connector.db_name' = '${db_name}',
    'connector.tbl_name' = '${tbl_name}',
    'connector.filter_query' = '{}',
    'connector.slot.split.step' = '1000000',
    'connector.split_num_max' = '1000'
);
create table sinkBson (doc STRING) with (
    'connector' = 'filesystem',
    'path' = '${path}',
    'sink.rolling-policy.file-size' = '100MB',
    'sink.file-suffix' = '.bson.gz',
    'format' = 'bson',
    'bson.codec' = 'gzip'
);
insert into sinkBson
select doc
from sourceMongo;

Flink SQL for the incremental cos2cos job: it deduplicates the oids extracted in the previous step, looks each oid up in MongoDB via a lookup join, writes the fetched documents to a BSON sink, and writes oids whose lookup returned nothing to a separate Parquet sink:

set `job_name` = 'oid2bson-sns_athena-athena-chatroom_process_record-20220516-11';
set `disableChaining` = true;
set `table.exec.shuffle-mode` = 'FORWARD_EDGES_PIPELINED';
set `xxx_cluster.restart-strategy` = 'disable';

create table sourceOid (
    `oid` STRING,
    proc_time AS PROCTIME()
) with (
    'connector' = 'filesystem',
    'path' = 'cosn://rugal-1251524319/baichuan_mongo_oid/sns_athena/athena.chatroom_process_record_hourly/dtm=20220516/hh=11',
    'format' = 'parquet',
    'parquet.compression' = 'SNAPPY',
    'sink.rolling-policy.file-size' = '128MB',
    'sink.rolling-policy.check-interval' = '10 min',
    'sink.partition-commit.trigger' = 'partition-time',
    'sink.partition-commit.delay' = '1 d',
    'sink.partition-commit.policy.kind' = 'success-file',
    'sink.file-suffix' = '.snappy.parquet',
    'partition.time-extractor.kind' = 'custom',
    'partition.time-extractor.class' = 'xxx.data.platform.flink.parser.NoHyphenPartTimeExtractor'
);
create table ods_mongo_lookup_v0 (_id VARCHAR, docs VARCHAR) with (
    'connector.type' = 'mongo',
    'connector.mode' = 'MERGE',
    'connector.url' = 'mongodb://dist_user:oh5WJyJJmRhh@mongos-search-30003-01:30003,mongos-search-30003-02:30003,mongos-abathur-01:30003,mongos-abathur-02:30003/?authSource=admin&readPreference=secondaryPreferxxx',
    'connector.sync.mode' = 'async',
    'connector.database' = 'athena',
    'connector.collection' = 'chatroom_process_record',
    'connector.property-version' = '1',
    'connector.rl.global.rate' = '10000'
);
create table sinkBson (doc STRING) with (
    'connector' = 'filesystem',
    'path' = 'cosn://rugal-1251524319/baichuan_mongo/sns_athena/athena.chatroom_process_record_hourly/dtm=20220516/hh=11',
    'sink.rolling-policy.file-size' = '100MB',
    'sink.file-suffix' = '.bson.gz',
    'format' = 'bson',
    'bson.codec' = 'gzip'
);
create table sinkMissOidParquet (`oid` STRING) with (
    'connector' = 'filesystem',
    'path' = 'cosn://rugal-1251524319/baichuan_mongo_lookup_miss/sns_athena/athena.chatroom_process_record_hourly/dtm=20220516/hh=11',
    'sink.rolling-policy.file-size' = '100MB',
    'sink.file-suffix' = '.snappy.parquet',
    'format' = 'parquet',
    'parquet.compression' = 'SNAPPY'
);
CREATE TEMPORARY VIEW v_oid_lookup_full AS
SELECT
    T.oid,
    RED_JSON_VALUE(D.docs, '$.0') as doc
FROM (
    SELECT *
    FROM (
        SELECT
            *,
            ROW_NUMBER() OVER (PARTITION BY oid ORDER BY proc_time ASC) as rn
        FROM sourceOid
    ) T0
    WHERE T0.rn = 1
) T
LEFT JOIN ods_mongo_lookup_v0 FOR SYSTEM_TIME AS OF T.proc_time AS D ON T.oid = D._id;

INSERT INTO sinkBson
SELECT doc
FROM v_oid_lookup_full
WHERE
doc IS NOT null
AND doc <> '';

INSERT INTO sinkMissOidParquet
SELECT oid
FROM v_oid_lookup_full
WHERE doc IS NULL OR CHAR_LENGTH(doc) < 10;

After each run, the files written to COS are copied to S3 with DistCp before being registered in Hive; below is a sample run for a full-load dataset (csqc.qc_result):

[2022-05-16 00:21:22,684] {distcp_operator.py:146} INFO - Running command: ssh -o StrictHostKeyChecking=false -o ServerAliveInterval=10 -o ServerAliveCountMax=100  hadoop@10.0.7.14 '''source /etc/profile
set -e

hdfs dfs  -test -e cosn://rugal-1251524319/baichuan_mongofull/cs/csqc.qc_result/dtm=20220515 || { echo "error: from_path not found, exit "; exit 1; }
size=`hdfs dfs -du -s cosn://rugal-1251524319/baichuan_mongofull/cs/csqc.qc_result/dtm=20220515 | cut -d " " -f1 2>&1 `
aws cloudwatch put-metric-data --metric-name DistCpSize --namespace QC_EMR --timestamp $(date +%s) --unit Bytes --value $size --dimensions taskName=mongotohive_m2hfull_cs_csqc.qc_result,Project=dataflow,Cluster=SH4-Data-HDFS

hadoop distcp -Dmapreduce.job.name=mongotohive_m2hfull_cs_csqc.qc_result:ailian:20220515T0000 -Dmapreduce.task.timeout=1800000 -Dmapreduce.map.memory.mb=5120 -Dmapreduce.map.java.opts=-Xmx4096m -Dfs.s3a.fast.upload=true -Dfs.s3a.fast.buffer=array -Dfs.s3a.threads.keepalivetime=60 -Dfs.s3a.fast.buffer.size=157286400 -Dfs.s3a.multipart.size=314572800 -Dfs.s3a.multipart.threshold=1073741824 -update -m 150 cosn://rugal-1251524319/baichuan_mongofull/cs/csqc.qc_result/dtm=20220515 s3a://xhs.alpha/ods/mongofull/cs/csqc.qc_result/dtm=20220515
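
As with the MySQL path, the final step is to expose the S3 data to Hive. A minimal sketch, assuming an external table named ods_mongofull_cs_csqc_qc_result already exists with a BSON-capable SerDe and a dtm partition column (the table name is illustrative; the location comes from the DistCp target above):

-- Hypothetical Hive DDL: register the daily partition that DistCp copied to S3.
ALTER TABLE ods_mongofull_cs_csqc_qc_result
  ADD IF NOT EXISTS PARTITION (dtm='20220515')
  LOCATION 's3a://xhs.alpha/ods/mongofull/cs/csqc.qc_result/dtm=20220515';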

References

Mermaid, used for the diagram syntax in this document: mermaid-js.github.io/mermaid/#/.…