StarRocks从本地文件系统导入数据实战

112 阅读4分钟

从本地文件系统导入数据到StarRocks:

  • 同步导入:Stream Load,使用HTTP PUT协议;支持格式:CSV、JSON;文件大小不超过10GB。

  • 异步导入: Broker Load,使用MySQL协议;支持格式:Parquet、ORC、CSV、JSON;文件大小超过10GB。

案例1将本地文件系统中的一份行政区划代码数据导入到StarRocks中

1.1 数据文件

数据文件名region_code.csv,由3列数据组成,分别是:代码,地区。样本如下:

110000, 北京市							
110101, 东城区							
110102, 西城区							
110105, 朝阳区							
110106, 丰台区							
110107, 石景山区							
110108, 海淀区
110109, 门头沟区							
110111, 房山区							
110112, 通州区							
110113, 顺义区							
110114, 昌平区							
110115, 大兴区

1.2 创建数据库和表

--建库
CREATE DATABASE IF NOT EXISTS testdb;
USE testdb;
--建表,表类型为主键表
CREATE TABLE `region_dim`
(
    `id` bigint NOT NULL AUTO_INCREMENT COMMENT "自增ID",
    `code` int NOT NULL COMMENT "区域代码",
    `name` varchar(30)) NULL COMMENT "区域名称"
)
ENGINE=OLAP
PRIMARY KEY(`code`)
DISTRIBUTED BY HASH(`id`)
--创建具有自增列的表时,必须设置 'replicated_storage' = 'true',以确保所有副本具有相同的自增 IDPROPERTIES("replicated_storage" = "true");

1.3 数据导入

1.3.1 Stream Load导入

curl --location-trusted -u <username>:<password> -H "label:region_dim_import" \
    -H "Expect:100-continue" \
    -H "column_separator:," \
    -H "columns: code, name" \
    -T region_code.csv -XPUT \
    http://<fe_host>:<fe_http_port>/api/testdb/region_dim/_stream_load

1.3.2 Broker Load导入

--清空表内数据
TRUNCATE TABLE testdb.region_dim;
--导入数据
LOAD LABEL testdb.region_dim_bl
(
    DATA INFILE("file:///home/disk1/business/csv/region_code.csv")
    INTO TABLE region_dim
    COLUMNS TERMINATED BY ","
    (code, name)
)
WITH BROKER
PROPERTIES
(
    "timeout" = "3600"
);

在 v3.1 及更高版本中,您可以从 information_schema.loads 视图中查看 Broker Load 作业的进度:

SELECT * FROM information_schema.loads where LABEL = "region_dim_bl"

案例2将本地文件系统中的一份天气数据导入到StarRocks中

2.1 数据文件

数据文件名称weather_data.json,样本如下:

{    "status":0,
     "results": [
     {
        "location":{
            "country":"中国",
            "province":"北京市",
            "city":"北京市",
            "name":"东城",
            "id":"110101"
        },
        "now":{
            "temp":4,
            "feels_like":1,
            "rh":73,
            "wind_class":"2级",
            "wind_dir":"东风",
            "text":"多云",
            "prec_1h":0,
            "clouds":999999,
            "vis":3471,
            "aqi":140,
            "pm25":107,
            "pm10":0,
            "no2":23,
            "so2":22,
            "o3":70,
            "co":1.7,
            "uptime":"20200220143500"
        },
        "indexes":[
            {
                "name":"晨练指数",
                "brief":"较适宜",
                "detail":"天气阴沉,请避免在林中晨练。"
            },
            {
                "name":"洗车指数",
                "brief":"适宜",
                "detail":"天气较好,适合擦洗汽车。"
            }
        ],
        "alerts":[
            {
                "type": "道路冰雪",
                "level": "蓝色预警",
                "title": "市气象局发布道路冰雪蓝色预警[IV级/一般]", 
                "desc": "市气象局发布道路冰雪蓝色预警信号:受降雪天气影响,预计未来 24 小时我市将出现对交通有影响的道路结冰或积雪,请有关部门及广大群众做好防范工作。"
            }
        ],
        "forecasts":[
            {
                "date":"2020-02-20",
                "week":"星期四",
                "high":7,
                "low":-2,
                "wc_day":"<3级",
                "wc_night":"<3级",
                "wd_day":"东南风",
                "wd_night":"北风",
                "text_day":"多云",
                "text_night":"阴",
                "aqi":93
            }],
        "forecast_hours":[
            {
                "text":"晴",
                "temp_fc":14,
                "wind_class":"3~4级",
                "wind_dir":"西南风",
                "rh":15,
                "prec_1h":0,
                "clouds":10,
                "data_time":"2020-04-01 16:00:00"
            },
            {
                "text":"晴",
                "temp_fc":14,
                "wind_class":"3~4级",
                "wind_dir":"西南风",
                "rh":13,
                "prec_1h":0,
                "clouds":10,
                "data_time":"2020-04-01 17:00:00"
            }]
    }],
    "message":"success"
}

2.2 创建数据库和表

--建库
CREATE DATABASE IF NOT EXISTS testdb;
USE testdb;
--建表,表类型为主键表
CREATE TABLE `weather_data`
(
    `code` bigint NOT NULL COMMENT "区域代码",
    `name` varchar(30) NOT NULL COMMENT "区域名称",
    `city` varchar(30) NOT NULL COMMENT "城市",
    `province` varchar(30) NOT NULL COMMENT "省份",
    `country` varchar(30) NOT NULL COMMENT "国家",
    `now` JSON NULL COMMENT "今日天气",
    `indexes` ARRAY<JSON> NULL COMMENT "天气指数",
    `alerts` ARRAY<JSON> NULL COMMENT "天气预警",
    `forecasts` ARRAY<JSON> NULL COMMENT "天气预报",
    `forecast_hours` ARRAY<JSON> NULL COMMENT "小时天气预报"
)
ENGINE=OLAP
PRIMARY KEY(`code`,`name`,`city`,`province`)
DISTRIBUTED BY HASH(`code`);

2.3 数据导入

2.3.1 Stream Load导入

curl -v --location-trusted -u <username>:<password> -H "strict_mode: true" \
    -H "Expect:100-continue" \
    -H "format: json" \
    -H "jsonpaths: [\"$.results[*].location.id\", \"$.results[*].location.name\", \"$.results[*].location.city\", \"$.results[*].location.province\", \"$.results[*].location.country\", \"$.results[*].now\", \"$.results[*].indexes\", \"$.results[*].alerts\", \"$.results[*].forecasts\", \"$.results[*].forecast_hours\"]" \
    -H "columns: code, name, city, province, country, now, indexes, alerts, forecasts, forecast_hours" \
    -T weather_data.json \
    -XPUT http://<fe_host>:<fe_http_port>/api/testdb/weather_data/_stream_load

2.3.2 Broker Load导入

--清空表内数据
TRUNCATE TABLE testdb.weather_data;
--导入数据
LOAD LABEL testdb.weather_data_bl
(
    DATA INFILE("file:///home/disk1/business/json/weather_data.json")
    INTO TABLE weather_data
    FORMAT AS "json"
    (code, name, city, province, country, now, indexes, alerts, forecasts, forecast_hours)
)
WITH BROKER
PROPERTIES
(
    "json_root"="$.results",
    "strip_outer_array" = "true",    
    "jsonpaths" = "[\"$.location.id\", \"$.location.name\", \"$.location.city\", \"$.location.province\", \"$.location.country\", \"$.now\", \"$.indexes\", \"$.alerts\", \"$.forecasts\", \"$.forecast_hours\"]"
);