初始化 Hive 元数据至 Atlas
- HiveMetaStoreBridge:离线集成 Hive 数据的作用类。
<!-- 离线集成,在 Atlas 服务器上执行下述脚本 -->
/<atlas package>/hook-bin/import-hive.sh
<!-- 脚本中实际调用了 HiveMetaStoreBridge 的 main 方法 -->
"${JAVA_BIN}" ${JAVA_PROPERTIES} -cp "${CP}" org.apache.atlas.hive.bridge.HiveMetaStoreBridge $IMPORT_ARGS
HiveMetaStoreBridge 处理流程
- HiveMetaStoreBridge:内部持有 Hive、Atlas 客户端,用于与服务端通信获取数据。
/**
 * A Bridge Utility that imports metadata from the Hive Meta Store
 * and registers them in Atlas.
 */
public class HiveMetaStoreBridge {
// Default Atlas server endpoint, used when none is configured
private static final String DEFAULT_ATLAS_URL = "http://localhost:21000/";
// Client used to read metadata from the Hive Meta Store
private final Hive hiveClient;
// Client used to register/update entities in Atlas
private final AtlasClientV2 atlasClientV2;
}
- HiveMetaStoreBridge.main():接收客户端的命令行,根据命令行判断是通过文件导入或命令行导入数据至 Atlas,文件导入处理交由 HiveMetaStoreBridgeV2 处理,命令行导入交由 HiveMetaStoreBridge 处理。
/**
 * Entry point invoked by import-hive.sh.
 *
 * Parses the command line, builds an AtlasClientV2 (basic auth when Kerberos
 * is disabled, otherwise the current UGI user), then delegates the import:
 * when an output file path option is present the zip-based
 * HiveMetaStoreBridgeV2 is used, otherwise metadata is imported directly via
 * HiveMetaStoreBridge. Always terminates the JVM with an exit code.
 */
public static void main(String[] args) {
int exitCode = EXIT_CODE_FAILED;
AtlasClientV2 atlasClientV2 = null;
Options acceptedCliOptions = prepareCommandLineOptions();
try {
// 1. Parse the command line; leftover unrecognized arguments are an error
CommandLine cmd = new BasicParser().parse(acceptedCliOptions, args);
List<String> argsNotProcessed = cmd.getArgList();
if (argsNotProcessed != null && argsNotProcessed.size() > 0) {
throw new ParseException("Unrecognized arguments.");
}
if (cmd.hasOption(OPTION_HELP_SHORT)) {
printUsage(acceptedCliOptions);
exitCode = EXIT_CODE_SUCCESS;
} else {
Configuration atlasConf = ApplicationProperties.get();
String[] atlasEndpoint = atlasConf.getStringArray(ATLAS_ENDPOINT);
if (atlasEndpoint == null || atlasEndpoint.length == 0) {
atlasEndpoint = new String[] { DEFAULT_ATLAS_URL };
}
// 2. Initialize the Atlas client (basic auth or Kerberos)
if (!AuthenticationUtil.isKerberosAuthenticationEnabled()) {
String[] basicAuthUsernamePassword = AuthenticationUtil.getBasicAuthenticationInput();
atlasClientV2 = new AtlasClientV2(atlasEndpoint, basicAuthUsernamePassword);
} else {
UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
atlasClientV2 = new AtlasClientV2(ugi, ugi.getShortUserName(), atlasEndpoint);
}
boolean createZip = cmd.hasOption(OPTION_OUTPUT_FILEPATH_LONG);// true => export to a zip file first
if (createZip) {
// 3.1 Zip-file based import, handled by HiveMetaStoreBridgeV2
HiveMetaStoreBridgeV2 hiveMetaStoreBridgeV2 = new HiveMetaStoreBridgeV2(atlasConf, new HiveConf(), atlasClientV2);
if (hiveMetaStoreBridgeV2.exportDataToZipAndRunAtlasImport(cmd)) {
exitCode = EXIT_CODE_SUCCESS;
}
} else {
// 3.2 Direct import, handled by HiveMetaStoreBridge
HiveMetaStoreBridge hiveMetaStoreBridge = new HiveMetaStoreBridge(atlasConf, new HiveConf(), atlasClientV2);
if (hiveMetaStoreBridge.importDataDirectlyToAtlas(cmd)) {
exitCode = EXIT_CODE_SUCCESS;
}
}
}
} catch(ParseException e) {
LOG.error("Invalid argument. Error: {}", e.getMessage());
System.out.println("Invalid argument. Error: " + e.getMessage());
exitCode = EXIT_CODE_INVALID_ARG;
if (!(e instanceof MissingArgumentException)) {
printUsage(acceptedCliOptions);
}
} catch(Exception e) {
LOG.error("Import Failed", e);
} finally {
if( atlasClientV2 !=null) {
atlasClientV2.close();
}
}
System.exit(exitCode);
}
- HiveMetaStoreBridge.importDataDirectlyToAtlas():根据命令行内容进行判断,1.1删除 Hive 元数据、1.2根据文件内容将 Hive 元数据导入 Atlas、1.3根据命令行将 Hive 元数据导入 Atlas,三种方式选其一,其中导入的核心方法是 importDatabases()。
/**
 * Imports Hive metadata into Atlas based on the parsed command line.
 *
 * Exactly one of three paths is taken:
 *   1.1 delete Atlas entities for Hive objects that no longer exist,
 *   1.2 import databases/tables listed in a file (one "db[:table]" per line),
 *   1.3 import the database/table given directly on the command line.
 *
 * @param cmd parsed command-line options
 * @return true if the requested operation completed its dispatch; false when
 *         a file to import was specified but does not exist or is unreadable
 * @throws Exception propagated from the underlying import/delete calls
 */
public boolean importDataDirectlyToAtlas(CommandLine cmd) throws Exception {
// .....
if (deleteNonExisting) {
// 1.1 Remove metadata from Atlas for Hive objects that no longer exist
deleteEntitiesForNonExistingHiveMetadata(failOnError, databaseToImport, tableToImport);
ret = true;
} else if (StringUtils.isNotEmpty(fileToImport)) {
// 1.2 Import every database/table entry listed in the file
File f = new File(fileToImport);
if (f.exists() && f.canRead()) {
// try-with-resources: the original version leaked this reader on exception
try (BufferedReader br = new BufferedReader(new FileReader(f))) {
String line = null;
while ((line = br.readLine()) != null) {
String[] val = line.split(":"); // expected line format: dbName[:tableName]
if (ArrayUtils.isNotEmpty(val)) {
databaseToImport = val[0];
tableToImport = (val.length > 1) ? val[1] : "";
importDatabases(failOnError, databaseToImport, tableToImport); // import this database's metadata
}
}
}
ret = true;
}
// NOTE(review): when the file is missing/unreadable, ret stays false — confirm intended
} else {
// 1.3 Import the database/table specified on the command line
importDatabases(failOnError, databaseToImport, tableToImport); // import this database's metadata
ret = true;
}
return ret;
}
- HiveMetaStoreBridge.importDatabases():判断导入 Atlas 的数据库、表元数据范围,同时将数据库元数据通过 registerDatabase() 方法注册到 Atlas 上,再调用 importTables() 导入表元数据。
/**
 * Resolves which Hive databases fall inside the import scope, registers a
 * database entity in Atlas for each of them via registerDatabase(), then
 * delegates table-level import to importTables().
 *
 * Scope rules:
 *  - no db and no table given -> import every database
 *  - only a table given       -> if it looks like "db.table", split it and
 *                                match the db by pattern; otherwise scan all
 *                                databases
 *  - a db given               -> match databases by pattern; the table filter
 *                                (if any) is still applied inside importTables()
 *
 * Terminates the JVM with EXIT_CODE_FAILED when no database matches.
 */
private void importDatabases(boolean failOnError, String databaseToImport, String tableToImport) throws Exception {
List<String> databaseNames = null;
if (StringUtils.isEmpty(databaseToImport) && StringUtils.isEmpty(tableToImport)) {
// Neither database nor table specified: import all databases
databaseNames = hiveClient.getAllDatabases();
} else if (StringUtils.isEmpty(databaseToImport) && StringUtils.isNotEmpty(tableToImport)) {
// Only a table was specified: derive the database from "db.table" if present
if (isTableWithDatabaseName(tableToImport)) {
String val[] = tableToImport.split("\\.");
if (val.length > 1) {
databaseToImport = val[0];
tableToImport = val[1];
}
databaseNames = hiveClient.getDatabasesByPattern(databaseToImport);
} else {
databaseNames = hiveClient.getAllDatabases();
}
} else {
// A database was specified: resolve it by pattern (table filter is applied later by importTables)
databaseNames = hiveClient.getDatabasesByPattern(databaseToImport);
}
if(!CollectionUtils.isEmpty(databaseNames)) {
for (String databaseName : databaseNames) {
AtlasEntityWithExtInfo dbEntity = registerDatabase(databaseName); // create/update the database entity in Atlas
if (dbEntity != null) {
importTables(dbEntity.getEntity(), databaseName, tableToImport, failOnError); // import this database's tables
}
}
} else {
LOG.error("No database found");
System.exit(EXIT_CODE_FAILED);
}
}
- HiveMetaStoreBridge.importTables():通过内部 HiveClient 获取导入 Atlas 的表清单信息,并遍历清单调用importTable() 单表导入。
/**
 * Imports the tables of one Hive database into Atlas.
 *
 * Resolves the table list through the Hive client (all tables, or those
 * matching the given pattern), imports each one via importTable(), and logs
 * a summary of how many succeeded.
 *
 * @return the number of tables imported successfully
 */
private int importTables(AtlasEntity dbEntity, String databaseName, String tblName, final boolean failOnError) throws Exception {
// 1. Resolve the candidate table list
final List<String> candidateTables = StringUtils.isEmpty(tblName)
        ? hiveClient.getAllTables(databaseName)
        : hiveClient.getTablesByPattern(databaseName, tblName);
if (CollectionUtils.isEmpty(candidateTables)) {
    LOG.error("No tables to import in database {}", databaseName);
    return 0;
}
LOG.info("Found {} tables to import in database {}", candidateTables.size(), databaseName);
// 2. Import each table and count the successes
int importedCount = 0;
try {
    for (String candidate : candidateTables) {
        importedCount += importTable(dbEntity, databaseName, candidate, failOnError); // single-table import
    }
} finally {
    // Summary is logged even when importTable threw (failOnError case)
    if (importedCount == candidateTables.size()) {
        LOG.info("Successfully imported {} tables from database {}", importedCount, databaseName);
    } else {
        LOG.error("Imported {} of {} tables from database {}. Please check logs for errors during import", importedCount, candidateTables.size(), databaseName);
    }
}
return importedCount;
}
- HiveMetaStoreBridge.importTable():通过内部 HiveClient 获取表实体信息,并通过 registerTable() 注册表实体到 Atlas 上,若是外表则需要额外注册更多的实体信息,如:外表路径、外表建表SQL等信息,通过 registerInstances() 注册额外信息。
/**
 * Imports a single Hive table into Atlas.
 *
 * Fetches the table from the Hive Meta Store and creates/updates its Atlas
 * entity via registerTable(). For EXTERNAL tables it additionally ensures a
 * hive_process lineage entity exists (path entity as input, table entity as
 * output, plus the reconstructed CREATE TABLE statement), registering it
 * through registerInstances() when missing.
 *
 * @return 1 when the table was processed successfully, 0 on failure
 *         (unless failOnError is set, in which case the exception is rethrown)
 */
public int importTable(AtlasEntity dbEntity, String databaseName, String tableName, final boolean failOnError) throws Exception {
try {
// 1. Fetch the table metadata from the Hive Meta Store
Table table = hiveClient.getTable(databaseName, tableName);
AtlasEntityWithExtInfo tableEntity = registerTable(dbEntity, table); // create/update the table entity in Atlas
// 2. External tables also get a hive_process entity (a Type predefined in the
//    Atlas TypeSystem) describing how the table was created
if (table.getTableType() == TableType.EXTERNAL_TABLE) {
String processQualifiedName = getTableProcessQualifiedName(metadataNamespace, table); // unique name of the create-table process
AtlasEntityWithExtInfo processEntity = findProcessEntity(processQualifiedName);
// 2.1 Process entity not yet registered: build it and register it
if (processEntity == null) {
// Table location and the reconstructed CREATE TABLE statement
String tableLocationString = isConvertHdfsPathToLowerCase() ? lower(table.getDataLocation().toString()) : table.getDataLocation().toString();
Path location = table.getDataLocation();
String query = getCreateTableString(table, tableLocationString);
// Build the path entity and populate the process entity attributes
PathExtractorContext pathExtractorCtx = new PathExtractorContext(getMetadataNamespace(), isConvertHdfsPathToLowerCase(), awsS3AtlasModelVersion);
AtlasEntityWithExtInfo entityWithExtInfo = AtlasPathExtractorUtil.getPathEntity(location, pathExtractorCtx);
AtlasEntity pathInst = entityWithExtInfo.getEntity();
AtlasEntity tableInst = tableEntity.getEntity();
AtlasEntity processInst = new AtlasEntity(HiveDataTypes.HIVE_PROCESS.getName());
long now = System.currentTimeMillis();
processInst.setAttribute(ATTRIBUTE_QUALIFIED_NAME, processQualifiedName);
processInst.setAttribute(ATTRIBUTE_NAME, query);
processInst.setAttribute(ATTRIBUTE_CLUSTER_NAME, metadataNamespace);
processInst.setRelationshipAttribute(ATTRIBUTE_INPUTS, Collections.singletonList(AtlasTypeUtil.getAtlasRelatedObjectId(pathInst, RELATIONSHIP_DATASET_PROCESS_INPUTS)));
processInst.setRelationshipAttribute(ATTRIBUTE_OUTPUTS, Collections.singletonList(AtlasTypeUtil.getAtlasRelatedObjectId(tableInst, RELATIONSHIP_PROCESS_DATASET_OUTPUTS)));
String userName = table.getOwner();
if (StringUtils.isEmpty(userName)) {
userName = ApplicationProperties.get().getString(HIVE_USERNAME, "hive");
}
processInst.setAttribute(ATTRIBUTE_USER_NAME, userName);
processInst.setAttribute(ATTRIBUTE_START_TIME, now);
processInst.setAttribute(ATTRIBUTE_END_TIME, now);
processInst.setAttribute(ATTRIBUTE_OPERATION_TYPE, "CREATETABLE");
processInst.setAttribute(ATTRIBUTE_QUERY_TEXT, query);
processInst.setAttribute(ATTRIBUTE_QUERY_ID, query);
processInst.setAttribute(ATTRIBUTE_QUERY_PLAN, "{}");
processInst.setAttribute(ATTRIBUTE_RECENT_QUERIES, Collections.singletonList(query));
AtlasEntitiesWithExtInfo createTableProcess = new AtlasEntitiesWithExtInfo();
createTableProcess.addEntity(processInst);
if (pathExtractorCtx.getKnownEntities() != null) {
pathExtractorCtx.getKnownEntities().values().forEach(entity -> createTableProcess.addEntity(entity));
} else {
createTableProcess.addEntity(pathInst);
}
// 2.2 Register the process (and path) entities in Atlas
registerInstances(createTableProcess);
} else {
LOG.info("Process {} is already registered", processQualifiedName);
}
}
// 3. Success for this table (external or not)
return 1;
} catch (Exception e) {
LOG.error("Import failed for hive_table {}", tableName, e);
if (failOnError) {
throw e;
}
return 0;
}
}
- HiveMetaStoreBridge.registerTable():通过查询 Atlas 中记录,判断当前表实体是否已经注册,如果不存在则通过 registerInstance() 注册,存在则通过 updateInstance() 更新 Atlas 信息。
/**
 * Creates or updates the Atlas entity for a Hive table.
 *
 * Looks the table up in Atlas by type + qualified name: when it is not yet
 * known, a new entity is registered via registerInstance(); otherwise the
 * existing entity is refreshed via updateInstance().
 *
 * @return the registered/updated table entity
 * @throws AtlasHookException wrapping any failure
 */
private AtlasEntityWithExtInfo registerTable(AtlasEntity dbEntity, Table table) throws AtlasHookException {
try {
// 1. Look up the table entity in Atlas by type + qualified name
AtlasEntityWithExtInfo existing = findTableEntity(table);
AtlasEntityWithExtInfo result;
if (existing != null) {
// 2a. Already registered: refresh the entity in place
LOG.info("Table {}.{} is already registered with id {}. Updating entity.", table.getDbName(), table.getTableName(), existing.getEntity().getGuid());
result = toTableEntity(dbEntity, table, existing);
updateInstance(result); // push the updated entity to Atlas
} else {
// 2b. Not yet known to Atlas: build and register a new entity
result = registerInstance(toTableEntity(dbEntity, table));
}
// 3. Hand back the registered/updated entity
return result;
} catch (Exception e) {
throw new AtlasHookException("HiveMetaStoreBridge.registerTable() failed.", e);
}
}
- HiveMetaStoreBridge.registerInstance():通过内部 AtlasClientV2 注册表实体信息,并将注册完后的数据返回。
/**
 * Registers a single entity in Atlas through AtlasClientV2 and returns the
 * server-side view of what was created.
 *
 * The first created entity (fetched back by GUID) becomes the result; every
 * further created entity — and the entities it refers to — is attached to
 * the result as a referred entity. Relationship attributes are stripped from
 * the result before returning.
 */
private AtlasEntityWithExtInfo registerInstance(AtlasEntityWithExtInfo entity) throws Exception {
// .....
AtlasEntityWithExtInfo ret = null;
// 1. Create the entity via the Atlas API
EntityMutationResponse response = atlasClientV2.createEntity(entity);
List<AtlasEntityHeader> createdEntities = response.getEntitiesByOperation(EntityMutations.EntityOperation.CREATE);
// 2. Assemble the result from what the server reports as created
if (CollectionUtils.isNotEmpty(createdEntities)) {
for (AtlasEntityHeader createdEntity : createdEntities) {
if (ret == null) {
// 2.1 First created entity: fetch it by GUID (globally unique id); it becomes the result
ret = atlasClientV2.getEntityByGuid(createdEntity.getGuid());
LOG.info("Created {} entity: name={}, guid={}", ret.getEntity().getTypeName(), ret.getEntity().getAttribute(ATTRIBUTE_QUALIFIED_NAME), ret.getEntity().getGuid());
} else if (ret.getEntity(createdEntity.getGuid()) == null) {
// 2.2 Additional entity not yet in the result: fetch it by GUID
AtlasEntityWithExtInfo newEntity = atlasClientV2.getEntityByGuid(createdEntity.getGuid());
// 2.3 Attach it to the result as a referred entity
ret.addReferredEntity(newEntity.getEntity());
// 2.4 Also attach any entities it refers to
if (MapUtils.isNotEmpty(newEntity.getReferredEntities())) {
for (Map.Entry<String, AtlasEntity> entry : newEntity.getReferredEntities().entrySet()) {
ret.addReferredEntity(entry.getKey(), entry.getValue());
}
}
LOG.info("Created {} entity: name={}, guid={}", newEntity.getEntity().getTypeName(), newEntity.getEntity().getAttribute(ATTRIBUTE_QUALIFIED_NAME), newEntity.getEntity().getGuid());
}
}
}
// 3. Strip relationship attributes from the result
clearRelationshipAttributes(ret);
// 4. Return the assembled result
return ret;
}
- HiveMetaStoreBridge.registerInstances():逻辑与 registerInstance() 一致,只是该方法为注册多实体。
/**
 * Registers multiple entities in Atlas through AtlasClientV2 and returns the
 * server-side view of what was created. Same logic as registerInstance(),
 * but operates on a batch: every created entity is fetched back by GUID and
 * added to the result, together with the entities it refers to. Relationship
 * attributes are stripped from the result before returning.
 */
private AtlasEntitiesWithExtInfo registerInstances(AtlasEntitiesWithExtInfo entities) throws Exception {
// .....
AtlasEntitiesWithExtInfo ret = null;
// 1. Create the entities via the Atlas API
EntityMutationResponse response = atlasClientV2.createEntities(entities);
List<AtlasEntityHeader> createdEntities = response.getEntitiesByOperation(EntityMutations.EntityOperation.CREATE);
// 2. Assemble the result from what the server reports as created
if (CollectionUtils.isNotEmpty(createdEntities)) {
ret = new AtlasEntitiesWithExtInfo();
for (AtlasEntityHeader createdEntity : createdEntities) {
// 2.1 Fetch the created entity by GUID (globally unique id)
AtlasEntityWithExtInfo entity = atlasClientV2.getEntityByGuid(createdEntity.getGuid());
// 2.2 Add it to the result
ret.addEntity(entity.getEntity());
// 2.3 Also attach any entities it refers to
if (MapUtils.isNotEmpty(entity.getReferredEntities())) {
for (Map.Entry<String, AtlasEntity> entry : entity.getReferredEntities().entrySet()) {
ret.addReferredEntity(entry.getKey(), entry.getValue());
}
}
LOG.info("Created {} entity: name={}, guid={}", entity.getEntity().getTypeName(), entity.getEntity().getAttribute(ATTRIBUTE_QUALIFIED_NAME), entity.getEntity().getGuid());
}
}
// 3. Strip relationship attributes from the result
clearRelationshipAttributes(ret);
// 4. Return the assembled result
return ret;
}
一句话总结下 HiveMetaStoreBridge 处理流程:HiveMetaStoreBridge 通过内部持有的 HiveClient 和 AtlasClientV2 与 Hive、Atlas 服务端通信,获取以及注册元数据信息;同时根据用户命令行交互,实现对 Hive 某个库中的某张表或者一整个库进行元数据注册以及更新。