1 Underlying engines
The big-data processing engines that can be used for data exchange include Sqoop, Flink, and DataX (open-sourced by Alibaba).
2 Data exchange pipelines
2.1 Mysql2hive
Taking MySQL-to-Hive as an example, this pipeline is implemented with the Sqoop tool.
Sqoop architecture diagram
// Extraction command (from the job log)
2022-05-16 10:09:58,423] {xxx_to_s3_job.py:231} INFO - sqoop import -Dorg.apache.sqoop.splitter.allow_text_splitter=true -Dmapreduce.job.user.classpath.first=true -Dmapreduce.job.name=SQOOP_IMPORT_RUGAL_PLATFORM_HOURLY_ods_mysql_to_hive_all_type_case_hourly -Dmapreduce.map.maxattempts=1 -Dmapred.map.max.attempts=1 -Dmapreduce.map.speculative=false -Dmapred.map.tasks.speculative.execution=false -Dmapreduce.map.memory.mb=2048 -Dmapreduce.map.java.opts=-Xmx1536m --connect jdbc:mysql://10.0.25.26:33071/mysql_to_hive --username mysql_to_hive_rw --password 'xxxx' --split-by id --num-mappers 5 --query "select * from all_type_case where id > 1 and \$CONDITIONS;" --delete-target-dir --as-avrodatafile --compression-codec deflate --target-dir /external_export/data/ods/mysql/10.0.25.26/33071/mysql_to_hive/xxxx/20220516/hh=09/
Sqoop import writes the extracted data to files on HDFS; the files are then copied to S3 with DistCp, and Hive reads the data directly from the S3 location (a sketch of the Hive-side registration follows the diagram below).
graph TD
Mysql[(Database)] --> |sqoop import|Hdfs --> |distcp| S3
hive[(Database)] -->|load| S3
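A minimal sketch of the Hive-side load, assuming the data is registered as an external Avro table partitioned by dtm/hh; the table and column names here are hypothetical, while the S3 location mirrors the DistCp target shown below:
-- Hypothetical external table over the S3 data produced by --as-avrodatafile + distcp
CREATE EXTERNAL TABLE IF NOT EXISTS ods.ods_mysql_to_hive_all_type_case_hourly (
  id BIGINT  -- remaining columns follow the source MySQL table's schema
)
PARTITIONED BY (dtm STRING, hh STRING)
STORED AS AVRO
LOCATION 's3a://xxx/ods/mysql/10.0.25.26/mysql_to_hive/ods_mysql_to_hive_all_type_case_hourly/';

-- Register the hourly partition once DistCp has landed the files
ALTER TABLE ods.ods_mysql_to_hive_all_type_case_hourly
  ADD IF NOT EXISTS PARTITION (dtm='20220516', hh='09');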
DistCp command
10:10:25,405] {xxx_to_s3_job.py:305} INFO - hadoop distcp -Dmapreduce.job.name=distcp_RUGAL_PLATFORM_HOURLY_mysql_to_hive_all_type_case_hourly -Dmapreduce.task.timeout=60000 -Dfs.s3a.endpoint=s3.cn-north-1.amazonaws.com.cn -Dfs.s3a.attempts.maximum=10 -Dfs.s3a.connection.maximum=20 -Dfs.s3a.threads.keepalivetime=60 -Dmapreduce.job.queuename=adhoc -Dyarn.resourcemanager.am.max-attempts=4 -Dmapreduce.am.max-attempts=4 -strategy=dynamic -m=20 /external_export/data/ods/mysql/10.0.25.26/33071/mysql_to_hive/xxx/20220516/hh=09/ s3a://xxx/ods/mysql/10.0.25.26/mysql_to_hive/ods_mysql_to_hive_all_type_case_hourly/dtm=20220516/hh=09/
2.2 Mongo2hive
User flow diagram
Data flow diagram
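A rough reconstruction of the data flow, in the same notation as the MySQL diagram above and inferred from the Flink SQL and DistCp commands below (the final Hive load step is assumed by analogy with the MySQL pipeline):
graph TD
RocketMQ[(Oplog MQ)] --> |flink extract oid|CosOid[COS oid parquet]
CosOid --> |flink lookup join|CosBson[COS bson]
Mongo[(MongoDB)] --> |lookup / full dump|CosBson
CosBson --> |distcp| S3
hive[(Database)] --> |load| S3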
Parameters in the request sent to the Flink engine
{
    'baichuan_rocketmqtocos_path': 'cosn://rugal-1251524319/baichuan_mongo_oid/sns_athena/athena.chatroom_process_record_hourly/dtm=20220516/hh=11',
    'rocketmq_consumer': 'data-wp-consumer',
    'rocketmq_topic_name': 'db-athena_chatroom_process_record-sns_athena-1628939334000',
    'today_timestamp': '1652673660000',
    'mongo_collection_name_full': 'athena.chatroom_process_record',
    'job_name': 'mq2oid-sns_athena-athena-chatroom_process_record-20220516-11',
    'yesterday_timestamp': '1652670000000',
    'rocketmq_topic_namesrv': 'red-rocketmq-namesrv.int.xxx.com:9876'
}
Flink SQL for the job that extracts MQ data to COS
set `job_name` = 'mq2oid-sns_athena-athena-chatroom_process_record-20220516-11';
set `disableChaining` = true;
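-- Source: the Mongo oplog consumed from RocketMQ, bounded to the window
-- [yesterday_timestamp, today_timestamp) from the request parameters above.
-- Sink: each document's _id (oid) is written as snappy parquet to the COS path from the request.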
create table sourceOplog (
ns VARCHAR,
`timestamp` BIGINT,
op VARCHAR,
doc VARCHAR,
fromMigrate BOOLEAN,
proc_time as PROCTIME()
) with (
'connector.type' = 'rocketMQ',
'connector.nameserver.address' = 'xxx-rocketmq-namesrv.int.xiaohongshu.com:9876',
'connector.topic' = 'db-athena_chatroom_process_record-sns_athena-1628939334000',
'connector.group' = 'data-wp-consumer',
'connector.rocketMQ.async' = 'true',
'connector.offset.property' = 'oplog_ts', -- born, store, property
'connector.offset.start' = '1652670000000',
'connector.offset.end' = '1652673660000',
-- parse the oplog payload as protobuf
'format.type' = 'protobuf',
'format.protobuf.business' = 'protobuf',
'format.protobuf.message.name' = 'Oplog',
'format.protobuf.skip.bytes' = '4',
'format.property-version' = '1',
'format.protobuf.external.message' = 'true',
'format.protobuf.name' = 'oplog.proto'
-- 'format.type' = 'raw'
);
create table sinkOplog (
oid STRING
) with(
'connector' = 'filesystem',
'format' = 'parquet',
'path' = 'cosn://rugal-1251524319/baichuan_mongo_oid/sns_athena/athena.chatroom_process_record_hourly/dtm=20220516/hh=11',
'sink.file-suffix' = '.snappy.parquet',
'parquet.compression' = 'snappy'
);
insert into sinkOplog
select red_json_value(doc, '$._id.\\$oid') as oid
from sourceOplog where ns = 'athena.chatroom_process_record';
Flink SQL for the full (batch) dump of a Mongo collection
set `job_name` = '${job_name}';
set `disableChaining` = true;
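-- The db_emitter source scans the Mongo collection directly (no oplog); every document is
-- dumped as gzip-compressed bson to ${path}. The ${...} placeholders are filled in when the job is submitted.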
CREATE TABLE sourcMongo (doc VARCHAR) with (
'connector.type' = 'db_emitter',
'connector.property-version' = '1',
'connector.db_type' = 'mongo',
'connector.db_uri' = '${db_uri}',
'connector.user' = '',
'connector.passwd' = '',
'connector.db_name' = '${db_name}',
'connector.tbl_name' = '${tbl_name}',
'connector.filter_query' = '{}',
'connector.slot.split.step' = '1000000',
'connector.split_num_max' = '1000'
);
create table sinkBson (doc STRING) with(
'connector' = 'filesystem',
'path' = '${path}',
'sink.rolling-policy.file-size' = '100MB',
'sink.file-suffix' = '.bson.gz',
'format' = 'bson',
'bson.codec' = 'gzip'
);
insert into sinkBson
select doc from sourcMongo;
Flink SQL for the incremental COS-to-COS job
set `job_name` = 'oid2bson-sns_athena-athena-chatroom_process_record-20220516-11';
set `disableChaining` = true;
set `table.exec.shuffle-mode` = 'FORWARD_EDGES_PIPELINED';
set `xxx_cluster.restart-strategy` = 'disable';
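-- sourceOid reads the hourly oid parquet written by the mq2oid job above;
-- ods_mongo_lookup_v0 is a Mongo lookup (dimension) table;
-- sinkBson receives the full documents as bson on COS;
-- sinkMissOidParquet receives oids for which the lookup returned no document.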
create table sourceOid (
`oid` STRING,
proc_time AS PROCTIME()
) with (
'connector' = 'filesystem',
'path' = 'cosn://rugal-1251524319/baichuan_mongo_oid/sns_athena/athena.chatroom_process_record_hourly/dtm=20220516/hh=11',
'format' = 'parquet',
'parquet.compression' = 'SNAPPY',
'sink.rolling-policy.file-size' = '128MB',
'sink.rolling-policy.check-interval' = '10 min',
'sink.partition-commit.trigger' = 'partition-time',
'sink.partition-commit.delay' = '1 d',
'sink.partition-commit.policy.kind' = 'success-file',
'sink.file-suffix' = '.snappy.parquet',
'partition.time-extractor.kind' = 'custom',
'partition.time-extractor.class' = 'xxx.data.platform.flink.parser.NoHyphenPartTimeExtractor'
);
create table ods_mongo_lookup_v0 (_id VARCHAR, docs VARCHAR) with (
'connector.type' = 'mongo',
'connector.mode' = 'MERGE',
'connector.url' = 'mongodb://dist_user:xxxx@mongos-search-30003-01:30003,mongos-search-30003-02:30003,mongos-abathur-01:30003,mongos-abathur-02:30003/?authSource=admin&readPreference=secondaryPreferxxx',
'connector.sync.mode' = 'async',
'connector.database' = 'athena',
'connector.collection' = 'chatroom_process_record',
'connector.property-version' = '1',
'connector.rl.global.rate' = '10000'
);
create table sinkBson (doc STRING) with(
'connector' = 'filesystem',
'path' = 'cosn://rugal-1251524319/baichuan_mongo/sns_athena/athena.chatroom_process_record_hourly/dtm=20220516/hh=11',
'sink.rolling-policy.file-size' = '100MB',
'sink.file-suffix' = '.bson.gz',
'format' = 'bson',
'bson.codec' = 'gzip'
);
create table sinkMissOidParquet (`oid` STRING) with (
'connector' = 'filesystem',
'path' = 'cosn://rugal-1251524319/baichuan_mongo_lookup_miss/sns_athena/athena.chatroom_process_record_hourly/dtm=20220516/hh=11',
'sink.rolling-policy.file-size' = '100MB',
'sink.file-suffix' = '.snappy.parquet',
'format' = 'parquet',
'parquet.compression' = 'SNAPPY'
);
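-- Deduplicate the oids (ROW_NUMBER over proc_time keeps one row per oid), then do a
-- processing-time temporal lookup join against Mongo to fetch the full document for each oid.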
CREATE TEMPORARY VIEW v_oid_lookup_full
AS
SELECT
T.oid,
RED_JSON_VALUE(D.docs, '$.0') as doc
from
(
select
*
from
(
select
*,
ROW_NUMBER() OVER (
PARTITION BY oid
ORDER BY
proc_time ASC
) as rn
from
sourceOid
) T0
WHERE
T0.rn = 1
) T
LEFT JOIN ods_mongo_lookup_v0 FOR SYSTEM_TIME AS OF T.proc_time AS D ON T.oid = D._id;
INSERT INTO sinkBson
SELECT doc
FROM v_oid_lookup_full
WHERE
doc IS NOT null
AND doc <> '';
INSERT INTO sinkMissOidParquet
SELECT oid
FROM v_oid_lookup_full
WHERE doc IS NULL OR CHAR_LENGTH(doc) < 10;
DistCp command that copies the full Mongo dump from COS to S3 (from the job log):
[2022-05-16 00:21:22,684] {distcp_operator.py:146} INFO - Running command: ssh -o StrictHostKeyChecking=false -o ServerAliveInterval=10 -o ServerAliveCountMax=100 hadoop@10.0.7.14 '''source /etc/profile
set -e
hdfs dfs -test -e cosn://rugal-1251524319/baichuan_mongofull/cs/csqc.qc_result/dtm=20220515 || { echo "error: from_path not found, exit "; exit 1; }
size=`hdfs dfs -du -s cosn://rugal-1251524319/baichuan_mongofull/cs/csqc.qc_result/dtm=20220515 | cut -d " " -f1 2>&1 `
aws cloudwatch put-metric-data --metric-name DistCpSize --namespace QC_EMR --timestamp $(date +%s) --unit Bytes --value $size --dimensions taskName=mongotohive_m2hfull_cs_csqc.qc_result,Project=dataflow,Cluster=SH4-Data-HDFS
hadoop distcp -Dmapreduce.job.name=mongotohive_m2hfull_cs_csqc.qc_result:ailian:20220515T0000 -Dmapreduce.task.timeout=1800000 -Dmapreduce.map.memory.mb=5120 -Dmapreduce.map.java.opts=-Xmx4096m -Dfs.s3a.fast.upload=true -Dfs.s3a.fast.buffer=array -Dfs.s3a.threads.keepalivetime=60 -Dfs.s3a.fast.buffer.size=157286400 -Dfs.s3a.multipart.size=314572800 -Dfs.s3a.multipart.threshold=1073741824 -update -m 150 cosn://rugal-1251524319/baichuan_mongofull/cs/csqc.qc_result/dtm=20220515 s3a://xhs.alpha/ods/mongofull/cs/csqc.qc_result/dtm=20220515