Official docs: cwiki.apache.org/confluence/…
UDF
- UDF: User Defined Function
- Typical built-in UDFs: date/time functions, string functions, etc.
- Characteristic: one row in, one row out
- Creation process:
  - Define a Java class that extends org.apache.hadoop.hive.ql.udf.generic.GenericUDF or org.apache.hadoop.hive.ql.exec.UDF. Extending GenericUDF is recommended: it provides more functionality than the UDF class, such as a dedicated argument-validation method, and the UDF class has been marked as deprecated.
  - A class extending GenericUDF must override the parent's abstract methods; a class extending UDF must supply an evaluate() method itself, and testing shows the method name must be evaluate.
- Write the code
Maven dependency:
<properties>
    <hive.version>3.1.2</hive.version>
</properties>

<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-exec</artifactId>
    <version>${hive.version}</version>
</dependency>
Code 1:
package com.arc.hive.udf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

/**
 * @Description: Hive UDF that returns the length of a string.
 */
public class MyStringLengthFunction extends GenericUDF {

    /**
     * Initialization method; runs only once per UDF instance.
     * @param inspectors inspectors for the incoming arguments
     * @return the ObjectInspector describing the return type
     * @throws UDFArgumentException if the arguments are invalid
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] inspectors) throws UDFArgumentException {
        if (inspectors.length != 1) {
            throw new UDFArgumentException("exactly one argument is required");
        }
        if (inspectors[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
            throw new UDFArgumentException("the argument must be a primitive type");
        }
        // The function returns INT, so return an INT ObjectInspector here
        return PrimitiveObjectInspectorFactory.javaIntObjectInspector;
    }

    /**
     * The execution logic of the function.
     * @param arguments the deferred argument values
     * @return the length of the first argument's string form
     * @throws HiveException on evaluation failure
     */
    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        if (arguments[0].get() == null) {
            return 0;
        }
        return arguments[0].get().toString().length();
    }

    /**
     * Get the String to be displayed in explain.
     */
    @Override
    public String getDisplayString(String[] strings) {
        return "str_len(" + String.join(", ", strings) + ")";
    }
}
Code 2:
package com.arc.hive.udf;

import org.apache.hadoop.hive.ql.exec.UDF;

public class MyStringLengthFunction2 extends UDF {
    public int evaluate(String s) {
        if (s == null) {
            return 0;
        }
        return s.length();
    }
}
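Incidentally, the UDF base class resolves evaluate by argument signature, so evaluate can be overloaded. A minimal hypothetical sketch (the class name and two-argument behavior are invented for illustration):

package com.arc.hive.udf;

import org.apache.hadoop.hive.ql.exec.UDF;

// Hypothetical example: evaluate() may be overloaded; Hive's default method
// resolver picks the variant matching the call's argument types.
public class MyStringLengthFunction3 extends UDF {

    // One argument: length of a single string
    public int evaluate(String s) {
        return s == null ? 0 : s.length();
    }

    // Two arguments: combined length of both strings
    public int evaluate(String a, String b) {
        return evaluate(a) + evaluate(b);
    }
}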
- Package the code and upload the jar to the server's local filesystem
  - For the hive CLI, upload the jar to whichever server you run the hive client command on
  - For the beeline client, upload the jar to the local filesystem of the server hosting the hiveserver2 service
- Add the jar to the Hive classpath
ADD JAR /home/admin/original-arc-hive-1.0.jar;
Added [/home/admin/original-arc-hive-1.0.jar] to class path
Added resources: [/home/admin/original-arc-hive-1.0.jar]
- Create the functions
-- Create a temporary function; it stops working once the client reconnects
CREATE TEMPORARY FUNCTION str_len_1 AS 'com.arc.hive.udf.MyStringLengthFunction';
-- Create a permanent function; it remains available after the client reconnects (requires Hive 0.13+)
CREATE FUNCTION str_len_2 AS 'com.arc.hive.udf.MyStringLengthFunction2'
USING JAR 'hdfs:///user/hive/udf/original-arc-hive-1.0.jar';
- Notes
  - To create a temporary function, you must run ADD JAR first, then CREATE TEMPORARY FUNCTION
  - To create a permanent function, ADD JAR cannot be used; the USING JAR syntax is required. If the hive warehouse path is a local path, the path after USING JAR may also be local; otherwise it must be a cluster path, such as an HDFS path
- Use the functions
SELECT str_len_1();
Error: Error while compiling statement: FAILED: SemanticException [Error 10014]: Line 1:7 Wrong arguments 'str_len_1': exactly one argument is required (state=42000,code=10014)
SELECT str_len_1(100);
+------+
| _c0 |
+------+
| 3 |
+------+
SELECT str_len_1('Hello');
+------+
| _c0 |
+------+
| 5 |
+------+
SELECT str_len_2('Hello');
+------+
| _c0 |
+------+
| 5 |
+------+
UDTF
- UDTF: User Defined Table-Generating Function
- Typical built-in UDTF: EXPLODE(), commonly used together with LATERAL VIEW
- Characteristic: one row in, multiple rows out
- The creation process is the same as for a UDF, except the class must extend org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; only the code and usage results are shown here.
- Code:
package com.arc.hive.udtf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

import java.util.ArrayList;
import java.util.List;

/**
 * @Description: Splits a string by the given delimiter.
 */
public class MySplitFunction extends GenericUDTF {

    private final List<String> outputList = new ArrayList<>();

    /**
     * Initialization method: declares the names and types of the output columns.
     * @param argOIs inspectors for the incoming arguments
     * @return a StructObjectInspector describing the output rows
     * @throws UDFArgumentException if the arguments are invalid
     */
    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        // Define the names and types of the output columns
        List<String> fieldNames = new ArrayList<>();
        List<ObjectInspector> fieldTypes = new ArrayList<>();
        // A single output column named "words" of type STRING
        fieldNames.add("words");
        fieldTypes.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldTypes);
    }

    /**
     * The execution logic of the function.
     * @param args the argument values for one input row
     * @throws HiveException on evaluation failure
     */
    @Override
    public void process(Object[] args) throws HiveException {
        // First argument: the original string
        String arg = args[0].toString();
        // Second argument: the delimiter (treated as a regex by String.split)
        String delimiter = args[1].toString();
        // Split
        String[] arr = arg.split(delimiter, -1);
        // Emit one output row per token
        for (String word : arr) {
            // outputList is reused across rows, so clear it first
            outputList.clear();
            // Add the token
            outputList.add(word);
            // Forward one output row
            forward(outputList);
        }
    }

    /**
     * Release resources here if needed, e.g. database connections or IO streams.
     * @throws HiveException on failure
     */
    @Override
    public void close() throws HiveException {
    }
}
- Test:
ADD JAR /home/admin/original-arc-hive-1.0.jar;
CREATE TEMPORARY FUNCTION my_split AS 'com.arc.hive.udtf.MySplitFunction';
SELECT my_split('Hello,World', ',');
+--------+
| words |
+--------+
| Hello |
| World |
+--------+
-- Using the UDTF together with LATERAL VIEW
SELECT * FROM movie_info;
+-------------------+-------------------------+
| movie_info.movie | movie_info.category |
+-------------------+-------------------------+
| Cube | Thriller,Science |
| Saw | Thriller,Story,Dracula |
| Scary Movie | Comedy,Thriller |
+-------------------+-------------------------+
-- Expected output:
+--------------+-----------+
| movie | tag |
+--------------+-----------+
| Cube | Thriller |
| Cube | Science |
| Saw | Thriller |
| Saw | Story |
| Saw | Dracula |
| Scary Movie | Comedy |
| Scary Movie | Thriller |
+--------------+-----------+
-- SQL
-- movie_info_tmp: the alias of the virtual table produced by LATERAL VIEW; it is mandatory
-- tag: my_split() expands each element into its own row, forming a single column; tag names that
--      column and may be omitted, in which case it defaults to words (the name declared in initialize)
-- To name the generated column, the clause after the function must have the form: view_name AS udtf_column_name
-- In the SELECT list, tag can be written directly instead of movie_info_tmp.tag
SELECT movie, movie_info_tmp.tag FROM movie_info LATERAL VIEW my_split(category, ',') movie_info_tmp AS tag;
+--------------+---------------------+
| movie | movie_info_tmp.tag |
+--------------+---------------------+
| Cube | Thriller |
| Cube | Science |
| Saw | Thriller |
| Saw | Story |
| Saw | Dracula |
| Scary Movie | Comedy |
| Scary Movie | Thriller |
+--------------+---------------------+
SELECT movie, tag FROM movie_info LATERAL VIEW my_split(category, ',') movie_info_tmp AS tag;
+--------------+-----------+
| movie | tag |
+--------------+-----------+
| Cube | Thriller |
| Cube | Science |
| Saw | Thriller |
| Saw | Story |
| Saw | Dracula |
| Scary Movie | Comedy |
| Scary Movie | Thriller |
+--------------+-----------+
SELECT movie, movie_info_tmp.words FROM movie_info LATERAL VIEW my_split(category, ',') movie_info_tmp;
+--------------+-----------------------+
| movie | movie_info_tmp.words |
+--------------+-----------------------+
| Cube | Thriller |
| Cube | Science |
| Saw | Thriller |
| Saw | Story |
| Saw | Dracula |
| Scary Movie | Comedy |
| Scary Movie | Thriller |
+--------------+-----------------------+
SELECT movie, movie_info_tmp.* FROM movie_info LATERAL VIEW my_split(category, ',') movie_info_tmp;
+--------------+-----------------------+
| movie | movie_info_tmp.words |
+--------------+-----------------------+
| Cube | Thriller |
| Cube | Science |
| Saw | Thriller |
| Saw | Story |
| Saw | Dracula |
| Scary Movie | Comedy |
| Scary Movie | Thriller |
+--------------+-----------------------+
UDAF
- UDAF: User Defined Aggregation Function
- Typical built-in UDAFs: COUNT(), MAX(), SUM()
- Characteristic: multiple rows in, one row out
To be completed... (a minimal sketch follows below)
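As a placeholder until this section is fleshed out, here is a minimal, untested sketch of a null-skipping count (analogous to COUNT(col)) built on AbstractGenericUDAFResolver and GenericUDAFEvaluator; the package, class, and function names are invented for illustration:

package com.arc.hive.udaf;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;

/**
 * @Description: Hypothetical UDAF sketch, a count that skips NULLs.
 */
public class MyCountFunction extends AbstractGenericUDAFResolver {

    @Override
    public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
        if (parameters.length != 1) {
            throw new UDFArgumentException("exactly one argument is required");
        }
        return new MyCountEvaluator();
    }

    public static class MyCountEvaluator extends GenericUDAFEvaluator {

        /** The aggregation buffer holds the running count. */
        static class CountBuffer extends AbstractAggregationBuffer {
            long count;
        }

        // Inspector for the partial aggregate received by merge()
        private PrimitiveObjectInspector partialOI;

        @Override
        public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
            super.init(m, parameters);
            if (m == Mode.PARTIAL2 || m == Mode.FINAL) {
                // In these modes the single input is the partial aggregate
                partialOI = (PrimitiveObjectInspector) parameters[0];
            }
            // Both the partial and the final result are BIGINT
            return PrimitiveObjectInspectorFactory.javaLongObjectInspector;
        }

        @Override
        public AggregationBuffer getNewAggregationBuffer() throws HiveException {
            return new CountBuffer();
        }

        @Override
        public void reset(AggregationBuffer agg) throws HiveException {
            ((CountBuffer) agg).count = 0L;
        }

        /** Map side: consume one raw input row. */
        @Override
        public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
            if (parameters[0] != null) {
                ((CountBuffer) agg).count++;
            }
        }

        /** Map side output: emit the partial aggregate. */
        @Override
        public Object terminatePartial(AggregationBuffer agg) throws HiveException {
            return ((CountBuffer) agg).count;
        }

        /** Reduce side: fold a partial aggregate into the buffer. */
        @Override
        public void merge(AggregationBuffer agg, Object partial) throws HiveException {
            if (partial != null) {
                ((CountBuffer) agg).count += PrimitiveObjectInspectorUtils.getLong(partial, partialOI);
            }
        }

        /** Final result. */
        @Override
        public Object terminate(AggregationBuffer agg) throws HiveException {
            return ((CountBuffer) agg).count;
        }
    }
}

Packaging and registration would follow the same steps as for the UDF and UDTF above, e.g. ADD JAR followed by CREATE TEMPORARY FUNCTION my_count AS 'com.arc.hive.udaf.MyCountFunction'.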
Function-related Hive DDL
-- Create a temporary function
CREATE TEMPORARY FUNCTION function_name AS class_name;
-- Drop a temporary function. Since Hive 0.7.0, dropping a non-existent function without
-- the IF EXISTS keyword raises an error
-- Setting hive.exec.drop.ignorenonexistent = true suppresses the error
DROP TEMPORARY FUNCTION [IF EXISTS] function_name;
-- Create a permanent function, hive-0.13.0+
-- USING accepts JAR, FILE, or ARCHIVE resources
CREATE FUNCTION [db_name.]function_name AS class_name
[USING JAR|FILE|ARCHIVE 'file_uri' [, JAR|FILE|ARCHIVE 'file_uri'] ];
-- Drop a permanent function, hive-0.13.0+; dropping a non-existent function without
-- the IF EXISTS keyword raises an error
-- Setting hive.exec.drop.ignorenonexistent = true suppresses the error
DROP FUNCTION [IF EXISTS] function_name;
-- Reload functions, hive-1.2.0+
RELOAD (FUNCTIONS|FUNCTION);