从本地文件系统导入数据到StarRocks:
-
同步导入:Stream Load,使用HTTP PUT协议;支持格式:CSV、JSON;文件大小不超过10GB。
-
异步导入: Broker Load,使用MySQL协议;支持格式:Parquet、ORC、CSV、JSON;文件大小超过10GB。
案例1:将本地文件系统中的一份行政区划代码数据导入到StarRocks中。
1.1 数据文件
数据文件名region_code.csv,由3列数据组成,分别是:代码,地区。样本如下:
110000, 北京市
110101, 东城区
110102, 西城区
110105, 朝阳区
110106, 丰台区
110107, 石景山区
110108, 海淀区
110109, 门头沟区
110111, 房山区
110112, 通州区
110113, 顺义区
110114, 昌平区
110115, 大兴区
1.2 创建数据库和表
--建库
CREATE DATABASE IF NOT EXISTS testdb;
USE testdb;
--建表,表类型为主键表
CREATE TABLE `region_dim`
(
`id` bigint NOT NULL AUTO_INCREMENT COMMENT "自增ID",
`code` int NOT NULL COMMENT "区域代码",
`name` varchar(30)) NULL COMMENT "区域名称"
)
ENGINE=OLAP
PRIMARY KEY(`code`)
DISTRIBUTED BY HASH(`id`)
--创建具有自增列的表时,必须设置 'replicated_storage' = 'true',以确保所有副本具有相同的自增 ID。
PROPERTIES("replicated_storage" = "true");
1.3 数据导入
1.3.1 Stream Load导入
curl --location-trusted -u <username>:<password> -H "label:region_dim_import" \
-H "Expect:100-continue" \
-H "column_separator:," \
-H "columns: code, name" \
-T region_code.csv -XPUT \
http://<fe_host>:<fe_http_port>/api/testdb/region_dim/_stream_load
1.3.2 Broker Load导入
--清空表内数据
TRUNCATE TABLE testdb.region_dim;
--导入数据
LOAD LABEL testdb.region_dim_bl
(
DATA INFILE("file:///home/disk1/business/csv/region_code.csv")
INTO TABLE region_dim
COLUMNS TERMINATED BY ","
(code, name)
)
WITH BROKER
PROPERTIES
(
"timeout" = "3600"
);
在 v3.1 及更高版本中,您可以从 information_schema.loads 视图中查看 Broker Load 作业的进度:
SELECT * FROM information_schema.loads where LABEL = "region_dim_bl"
案例2:将本地文件系统中的一份天气数据导入到StarRocks中
2.1 数据文件
数据文件名称weather_data.json,样本如下:
{ "status":0,
"results": [
{
"location":{
"country":"中国",
"province":"北京市",
"city":"北京市",
"name":"东城",
"id":"110101"
},
"now":{
"temp":4,
"feels_like":1,
"rh":73,
"wind_class":"2级",
"wind_dir":"东风",
"text":"多云",
"prec_1h":0,
"clouds":999999,
"vis":3471,
"aqi":140,
"pm25":107,
"pm10":0,
"no2":23,
"so2":22,
"o3":70,
"co":1.7,
"uptime":"20200220143500"
},
"indexes":[
{
"name":"晨练指数",
"brief":"较适宜",
"detail":"天气阴沉,请避免在林中晨练。"
},
{
"name":"洗车指数",
"brief":"适宜",
"detail":"天气较好,适合擦洗汽车。"
}
],
"alerts":[
{
"type": "道路冰雪",
"level": "蓝色预警",
"title": "市气象局发布道路冰雪蓝色预警[IV级/一般]",
"desc": "市气象局发布道路冰雪蓝色预警信号:受降雪天气影响,预计未来 24 小时我市将出现对交通有影响的道路结冰或积雪,请有关部门及广大群众做好防范工作。"
}
],
"forecasts":[
{
"date":"2020-02-20",
"week":"星期四",
"high":7,
"low":-2,
"wc_day":"<3级",
"wc_night":"<3级",
"wd_day":"东南风",
"wd_night":"北风",
"text_day":"多云",
"text_night":"阴",
"aqi":93
}],
"forecast_hours":[
{
"text":"晴",
"temp_fc":14,
"wind_class":"3~4级",
"wind_dir":"西南风",
"rh":15,
"prec_1h":0,
"clouds":10,
"data_time":"2020-04-01 16:00:00"
},
{
"text":"晴",
"temp_fc":14,
"wind_class":"3~4级",
"wind_dir":"西南风",
"rh":13,
"prec_1h":0,
"clouds":10,
"data_time":"2020-04-01 17:00:00"
}]
}],
"message":"success"
}
2.2 创建数据库和表
--建库
CREATE DATABASE IF NOT EXISTS testdb;
USE testdb;
--建表,表类型为主键表
CREATE TABLE `weather_data`
(
`code` bigint NOT NULL COMMENT "区域代码",
`name` varchar(30) NOT NULL COMMENT "区域名称",
`city` varchar(30) NOT NULL COMMENT "城市",
`province` varchar(30) NOT NULL COMMENT "省份",
`country` varchar(30) NOT NULL COMMENT "国家",
`now` JSON NULL COMMENT "今日天气",
`indexes` ARRAY<JSON> NULL COMMENT "天气指数",
`alerts` ARRAY<JSON> NULL COMMENT "天气预警",
`forecasts` ARRAY<JSON> NULL COMMENT "天气预报",
`forecast_hours` ARRAY<JSON> NULL COMMENT "小时天气预报"
)
ENGINE=OLAP
PRIMARY KEY(`code`,`name`,`city`,`province`)
DISTRIBUTED BY HASH(`code`);
2.3 数据导入
2.3.1 Stream Load导入
curl -v --location-trusted -u <username>:<password> -H "strict_mode: true" \
-H "Expect:100-continue" \
-H "format: json" \
-H "jsonpaths: [\"$.results[*].location.id\", \"$.results[*].location.name\", \"$.results[*].location.city\", \"$.results[*].location.province\", \"$.results[*].location.country\", \"$.results[*].now\", \"$.results[*].indexes\", \"$.results[*].alerts\", \"$.results[*].forecasts\", \"$.results[*].forecast_hours\"]" \
-H "columns: code, name, city, province, country, now, indexes, alerts, forecasts, forecast_hours" \
-T weather_data.json \
-XPUT http://<fe_host>:<fe_http_port>/api/testdb/weather_data/_stream_load
2.3.2 Broker Load导入
--清空表内数据
TRUNCATE TABLE testdb.weather_data;
--导入数据
LOAD LABEL testdb.weather_data_bl
(
DATA INFILE("file:///home/disk1/business/json/weather_data.json")
INTO TABLE weather_data
FORMAT AS "json"
(code, name, city, province, country, now, indexes, alerts, forecasts, forecast_hours)
)
WITH BROKER
PROPERTIES
(
"json_root"="$.results",
"strip_outer_array" = "true",
"jsonpaths" = "[\"$.location.id\", \"$.location.name\", \"$.location.city\", \"$.location.province\", \"$.location.country\", \"$.now\", \"$.indexes\", \"$.alerts\", \"$.forecasts\", \"$.forecast_hours\"]"
);