由于我们在做数据迁移项目,数据迁移后需要验证原始表记录数是否与迁移后一致,因此需要实现一个用 Hive API 获取表记录数的功能。同事调研后说 Hive API 实现不了,只能用 Hive JDBC。我向来不信邪,想自己研究一下。
spark-hive中有一个HiveClientImpl类,我们可以尝试从这个类中发现一些蛛丝马迹,发现这个类里有一个
runSqlHive方法
/**
 * Executes a HiveQL statement and returns its output rows as strings.
 * Fails loudly if the result set appears to have been cut off at the
 * internal row cap, rather than silently returning partial data.
 */
override def runSqlHive(sql: String): Seq[String] = {
  // Upper bound on rows fetched in a single call to Hive.
  val rowLimit = 100000
  val rows = runHive(sql, rowLimit)
  // Hitting the cap exactly almost certainly means more rows existed.
  if (rows.size == rowLimit) {
    sys.error("RESULTS POSSIBLY TRUNCATED")
  }
  rows
}
其中调用了方法runHive
/**
 * Runs the given Hive command through the appropriate CommandProcessor and
 * returns the output rows (for a Driver-backed command) or the response code
 * (for other processors) as strings.
 *
 * @param cmd     the HiveQL / Hive shell command to execute
 * @param maxRows cap on the number of rows fetched from the Driver
 */
protected def runHive(cmd: String, maxRows: Int = 1000): Seq[String] = withHiveState {
  logDebug(s"Running hiveql '$cmd'")
  if (cmd.toLowerCase(Locale.ROOT).startsWith("set")) { logDebug(s"Changing config: $cmd") }
  try {
    val cmd_trimmed: String = cmd.trim()
    // Bug fix: "\s+" is an invalid escape inside a plain string literal; the
    // regex whitespace class must be written as "\\s+" to split on whitespace.
    val tokens: Array[String] = cmd_trimmed.split("\\s+")
    // The remainder of the command after the first token.
    val cmd_1: String = cmd_trimmed.substring(tokens(0).length()).trim()
    // The first token selects which Hive CommandProcessor handles the command.
    val proc = shim.getCommandProcessor(tokens(0), conf)
    proc match {
      case driver: Driver =>
        val response: CommandProcessorResponse = driver.run(cmd)
        // Throw an exception if there is an error in query processing.
        if (response.getResponseCode != 0) {
          driver.close()
          CommandProcessorFactory.clean(conf)
          throw new QueryExecutionException(response.getErrorMessage)
        }
        driver.setMaxRows(maxRows)
        // Pull the buffered result rows out of the Driver, then release it.
        val results = shim.getDriverResults(driver)
        driver.close()
        CommandProcessorFactory.clean(conf)
        results
      case _ =>
        // Non-Driver processors (SET, DFS, ...) only yield a response code.
        if (state.out != null) {
          // scalastyle:off println
          state.out.println(tokens(0) + " " + cmd_1)
          // scalastyle:on println
        }
        Seq(proc.run(cmd_1).getResponseCode.toString)
    }
  } catch {
    case e: Exception =>
      // Surface whatever Hive wrote to its output buffer to aid debugging.
      logError(
        s"""
           |======================
           |HIVE FAILURE OUTPUT
           |======================
           |${outputBuffer.toString}
           |======================
           |END HIVE FAILURE OUTPUT
           |======================
         """.stripMargin)
      throw e
  }
}
可以看到这里调用了Driver类的run方法,于是把这个好消息转告给同事
什么?居然说不能用,没关系,我是打不死的小强,继续看下这个run方法是怎么回事
/**
 * Runs a (possibly already compiled) command via runInternal and, on failure,
 * tries to re-emit the error through the session's MetaDataFormatter so that
 * downstream consumers get a machine-readable (JSON) error on stdout.
 */
public CommandProcessorResponse run(String command, boolean alreadyCompiled)
throws CommandNeedRetryException {
CommandProcessorResponse cpr = runInternal(command, alreadyCompiled);
// Success: nothing extra to report.
if(cpr.getResponseCode() == 0) {
return cpr;
}
// No session means no formatter/output stream to write the error to.
SessionState ss = SessionState.get();
if(ss == null) {
return cpr;
}
// Only the JSON formatter produces machine-readable errors; otherwise the
// error was already printed in plain text elsewhere.
MetaDataFormatter mdf = MetaDataFormatUtils.getFormatter(ss.getConf());
if(!(mdf instanceof JsonMetaDataFormatter)) {
return cpr;
}
/*Here we want to encode the error in machine readable way (e.g. JSON)
* Ideally, errorCode would always be set to a canonical error defined in ErrorMsg.
* In practice that is rarely the case, so the messy logic below tries to tease
* out canonical error code if it can. Exclude stack trace from output when
* the error is a specific/expected one.
* It's written to stdout for backward compatibility (WebHCat consumes it).*/
try {
// No underlying cause recorded: emit the message and response code as-is.
if(downstreamError == null) {
mdf.error(ss.out, errorMessage, cpr.getResponseCode(), SQLState);
return cpr;
}
ErrorMsg canonicalErr = ErrorMsg.getErrorMsg(cpr.getResponseCode());
if(canonicalErr != null && canonicalErr != ErrorMsg.GENERIC_ERROR) {
/*Some HiveExceptions (e.g. SemanticException) don't set
canonical ErrorMsg explicitly, but there is logic
(e.g. #compile()) to find an appropriate canonical error and
return its code as error code. In this case we want to
preserve it for downstream code to interpret*/
mdf.error(ss.out, errorMessage, cpr.getResponseCode(), SQLState, null);
return cpr;
}
if(downstreamError instanceof HiveException) {
// Use the exception's own canonical error code; include the stack
// trace only for generic (unexpected) errors.
HiveException rc = (HiveException) downstreamError;
mdf.error(ss.out, errorMessage,
rc.getCanonicalErrorMsg().getErrorCode(), SQLState,
rc.getCanonicalErrorMsg() == ErrorMsg.GENERIC_ERROR ?
org.apache.hadoop.util.StringUtils.stringifyException(rc)
: null);
}
else {
// Non-Hive cause: best-effort mapping of its message to a canonical
// error code, always including the stack trace.
ErrorMsg canonicalMsg =
ErrorMsg.getErrorMsg(downstreamError.getMessage());
mdf.error(ss.out, errorMessage, canonicalMsg.getErrorCode(),
SQLState, org.apache.hadoop.util.StringUtils.
stringifyException(downstreamError));
}
}
catch(HiveException ex) {
// Encoding the error itself failed; fall back to the console.
console.printError("Unable to JSON-encode the error",
org.apache.hadoop.util.StringUtils.stringifyException(ex));
}
return cpr;
}
继续看runInternal方法
/**
 * Core execution path: validates config, runs pre-driver hooks, compiles the
 * command (unless already compiled), acquires locks/opens a transaction if
 * required, executes the plan, then releases locks and runs post-driver hooks.
 * Returns a response whose code is 0 on success, non-zero on failure.
 */
private CommandProcessorResponse runInternal(String command, boolean alreadyCompiled)
throws CommandNeedRetryException {
// Reset per-run error state before doing anything.
errorMessage = null;
SQLState = null;
downstreamError = null;
if (!validateConfVariables()) {
return createProcessorResponse(12);
}
HiveDriverRunHookContext hookContext = new HiveDriverRunHookContextImpl(conf, command);
// Get all the driver run hooks and pre-execute them.
List<HiveDriverRunHook> driverRunHooks;
try {
// NOTE: hook classes are loaded from config (HIVE_DRIVER_RUN_HOOKS), so a
// misconfigured class name fails here with ClassNotFoundException.
driverRunHooks = getHooks(HiveConf.ConfVars.HIVE_DRIVER_RUN_HOOKS,
HiveDriverRunHook.class);
for (HiveDriverRunHook driverRunHook : driverRunHooks) {
driverRunHook.preDriverRun(hookContext);
}
} catch (Exception e) {
errorMessage = "FAILED: Hive Internal Error: " + Utilities.getNameMessage(e);
SQLState = ErrorMsg.findSQLState(e.getMessage());
downstreamError = e;
console.printError(errorMessage + "\n"
+ org.apache.hadoop.util.StringUtils.stringifyException(e));
return createProcessorResponse(12);
}
// Reset the perf logger
PerfLogger perfLogger = PerfLogger.getPerfLogger(true);
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.DRIVER_RUN);
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TIME_TO_SUBMIT);
int ret;
if (!alreadyCompiled) {
ret = compileInternal(command);
// Compilation failure short-circuits the run.
if (ret != 0) {
return createProcessorResponse(ret);
}
} else {
// Since we're reusing the compiled plan, we need to update its start time for current run
plan.setQueryStartTime(perfLogger.getStartTime(PerfLogger.DRIVER_RUN));
}
// the reason that we set the txn manager for the cxt here is because each
// query has its own ctx object. The txn mgr is shared across the
// same instance of Driver, which can run multiple queries.
ctx.setHiveTxnManager(SessionState.get().getTxnMgr());
if (requiresLock()) {
ret = acquireLocksAndOpenTxn();
if (ret != 0) {
// Lock acquisition failed: roll back whatever was taken and bail out.
try {
releaseLocksAndCommitOrRollback(ctx.getHiveLocks(), false);
} catch (LockException e) {
// Not much to do here
}
return createProcessorResponse(ret);
}
}
ret = execute();
if (ret != 0) {
//if needRequireLock is false, the release here will do nothing because there is no lock
try {
releaseLocksAndCommitOrRollback(ctx.getHiveLocks(), false);
} catch (LockException e) {
// Nothing to do here
}
return createProcessorResponse(ret);
}
//if needRequireLock is false, the release here will do nothing because there is no lock
try {
// Successful execution: commit and release.
releaseLocksAndCommitOrRollback(ctx.getHiveLocks(), true);
} catch (LockException e) {
errorMessage = "FAILED: Hive Internal Error: " + Utilities.getNameMessage(e);
SQLState = ErrorMsg.findSQLState(e.getMessage());
downstreamError = e;
console.printError(errorMessage + "\n"
+ org.apache.hadoop.util.StringUtils.stringifyException(e));
return createProcessorResponse(12);
}
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.DRIVER_RUN);
perfLogger.close(LOG, plan);
// Take all the driver run hooks and post-execute them.
try {
for (HiveDriverRunHook driverRunHook : driverRunHooks) {
driverRunHook.postDriverRun(hookContext);
}
} catch (Exception e) {
errorMessage = "FAILED: Hive Internal Error: " + Utilities.getNameMessage(e);
SQLState = ErrorMsg.findSQLState(e.getMessage());
downstreamError = e;
console.printError(errorMessage + "\n"
+ org.apache.hadoop.util.StringUtils.stringifyException(e));
return createProcessorResponse(12);
}
return createProcessorResponse(ret);
}
看到现在已经可以猜到,response中应该携带了返回的数据,写个简单的测试用例看下怎么拿到我们想要的表记录数:
查看HiveClientImpl中的runHive方法的时候发现已经有现成的方法直接可以照猫画虎的用:
val results = shim.getDriverResults(driver)
/** Fetches the result rows buffered inside the given Hive Driver as strings. */
override def getDriverResults(driver: Driver): Seq[String] = {
  // Driver#getResults fills a mutable Java list; we invoke it reflectively
  // via the pre-resolved method handle and convert the result to Scala.
  val buffer = new JArrayList[String]()
  getDriverResultsMethod.invoke(driver, buffer)
  buffer.asScala
}
测试用例
// Hadoop configuration whose entries seed the HiveConf used by the test.
private Configuration hadoopConfig;
// Kerberos principal used for keytab login (field name is a typo for "principal").
private String pricipal;
// Path to the Kerberos keytab file matching the principal above.
private String keyTab;
@Test
public void testDriver() throws CommandNeedRetryException, IOException {
// Seed a HiveConf from the Hadoop configuration so the Driver can reach
// the metastore/cluster under test.
HiveConf hiveConf = new HiveConf(SessionState.class);
hadoopConfig.forEach(it -> hiveConf.set(it.getKey(), it.getValue()));
Driver driver = new Driver(hiveConf);
// A SessionState must be started before the Driver executes commands.
SessionState localSession = new SessionState(hiveConf);
SessionState.start(localSession);
CommandProcessorResponse response = driver.run("select count(*) from test");
// Fail fast on query errors instead of silently reading empty results.
if (response.getResponseCode() != 0) {
throw new IllegalStateException("Query failed: " + response.getErrorMessage());
}
ArrayList<Object> results = new ArrayList<>();
driver.getResults(results);
// Bug fix: the original println() printed an empty line; print the rows.
System.out.println(results);
}
报
java.lang.RuntimeException: org.apache.hadoop.security.AccessControlException: SIMPLE authentication is not enabled. Available: [TOKEN, KERBEROS]
需要加上KERBEROS认证,于是加上
UserGroupInformation.loginUserFromKeytab(pricipal, keyTab);
遇到新的问题
java.lang.ClassNotFoundException: com.unraveldata.dataflow.hive.hook.UnravelHiveHook
查看报错异常栈,定位到异常是由HookLoader.getHooks这个方法抛出的,从源码可知这个hook的类路径是从配置中读到的
并且debug看到这里读到的配置key是hive.exec.driver.run.hooks, 值是com.unraveldata.dataflow.hive.hook.UnravelHiveHook, 和报错相符,由于是本地测试,删掉这个配置,让driver走默认的hook,重新运行,可以在results中拿到执行结果.