Function: use a regular expression to find all matching substrings and return them as an array.
regexp_extract_all(field: string, re: string, group: int) returns array<string>; combined with LATERAL VIEW explode it can be used to turn one row into multiple rows. Source code: https://github.com/leeshuaichao/hive_functions
Create a Maven project
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Coordinates of the UDF jar; the build produces hive_udf-1.0-SNAPSHOT.jar. -->
    <groupId>com.moxi.hive</groupId>
    <artifactId>hive_udf</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>
    <name>HiveUDFs</name>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- Hive classes are supplied by the cluster at runtime, so they are only needed to compile. -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>3.1.2</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Compile for Java 8 with UTF-8 sources. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
Copy the code
Write the UDF class (named with a "Udtf" prefix, but it extends GenericUDF)
package com.moxi.hive.udf.regexp;
import com.moxi.hive.udf.utils.RegexpUtils;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.IntWritable;
Regexp_extract_all (field, re, returns the number of parentheses :0 is all) * regexp_extract_all(field, re, returns the number of parentheses :0 is all) *@author[email protected] * 2020/11/23 2:33 PM **/
public class UdtfRegexpExtractAll extends GenericUDF {
@Override
public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
// Check if two arguments were passed
if(objectInspectors.length ! =2&& objectInspectors.length ! =3) {
throw new UDFArgumentLengthException(
"The function regexp_extract_all takes exactly 2 or 3 arguments.");
}
for (int i = 0; i < 2; i++) {
if(! ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaStringObjectInspector, objectInspectors[i])) {throw new UDFArgumentTypeException(i,
"\" " + PrimitiveObjectInspectorFactory.javaStringObjectInspector.getTypeName() + "\" "
+ "expected at function regexp_extract_all, but "
+ "\" " + objectInspectors[i].getTypeName() + "\" "
+ "is found"); }}if (objectInspectors.length == 3) {
if(! ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaIntObjectInspector, objectInspectors[2]) {throw new UDFArgumentTypeException(2."\" " + PrimitiveObjectInspectorFactory.javaLongObjectInspector.getTypeName() + "\" "
+ "expected at function regexp_extract_all, but "
+ "\" " + objectInspectors[2].getTypeName() + "\" "
+ "is found");
}
}
ObjectInspector expect = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
return ObjectInspectorFactory.getStandardListObjectInspector(expect);
}
@Override
public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
String source = deferredObjects[0].get().toString();
String pattern = deferredObjects[1].get().toString();
Integer groupIndex = 0;
if (deferredObjects.length == 3) {
groupIndex = ((IntWritable) deferredObjects[2].get()).get();
}
if (source == null) {
return null;
}
return RegexpUtils.findAll(pattern, source, groupIndex);
}
@Override
public String getDisplayString(String[] strings) {
assert (strings.length == 2 || strings.length == 3);
if (strings.length == 2) {
return "regexp_extract_all(" + strings[0] + ","
+ strings[1] + ")";
} else {
return "regexp_extract_all(" + strings[0] + ","
+ strings[1] + "," + strings[2] + ")"; }}}Copy the code
Regex utility class
package com.moxi.hive.udf.utils;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Regular-expression helper methods.
 *
 * <p>Created: 2020/11/23 2:45 PM.
 */
public class RegexpUtils {

    /**
     * Finds every match of {@code regex} in {@code content} and returns the requested group of
     * each match, in order of occurrence.
     *
     * @param regex   the regular expression
     * @param content the text to search; {@code null} yields an empty list
     * @param group   the capturing group (pair of parentheses) to return for each match,
     *                with {@code 0} being the entire match
     * @return a new list with one entry per match; an entry is {@code null} when the requested
     *         group did not take part in that match
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is not a valid pattern
     * @throws IndexOutOfBoundsException              if a match is found and the pattern has no
     *                                                group with index {@code group}
     */
    public static List<String> findAll(String regex, CharSequence content, int group) {
        List<String> collection = new ArrayList<>();
        // Compiled before the null check so that an invalid pattern is always reported.
        Pattern pattern = Pattern.compile(regex);
        if (content != null) {
            Matcher matcher = pattern.matcher(content);
            while (matcher.find()) {
                collection.add(matcher.group(group));
            }
        }
        return collection;
    }
}
Package and upload to the server for testing
Pitfalls encountered:
I had not built a jar with Maven for a long time. A plain Maven jar does not bundle its dependencies; packaging them into the jar requires special handling, so I removed the Hutool utility classes instead.
Create temporary functions
-- Add the jar to the current session
add jar /home/hive/apache-hive-3.1.2/lib/hive_udf-1.0-SNAPSHOT.jar;
-- Create a temporary function
create temporary function regexp_extract_all AS 'com.moxi.hive.udf.regexp.UdtfRegexpExtractAll';
Copy the code
Test temporary functions
select voice_num from (select regexp_extract_all(ret.abc, "@#(.*?)#@", 1) as vn from (select "@#5#@ I want to made bearing @#1 of 2#@ #@#@ hit, reimbursement, ###@" as abc) ret) test lateral view explode(test.vn) r as voice_num;
Delete temporary functions
drop temporary function regexp_extract_all;
-- Remove the jar from the current session
delete jar /home/hive/apache-hive-3.1.2/lib/hive_udf-1.0-SNAPSHOT.jar;
Generating permanent functions
Upload the JAR package to HDFS
# Create HDFS directory
hadoop fs -mkdir /lib
# Add the jar to HDFS
hadoop fs -put /home/hive/apache-hive-3.1.2/lib/hive_udf-1.0-SNAPSHOT.jar /lib/
# Check that the jar was uploaded successfully
hadoop fs -lsr /lib
Copy the code
Creating a permanent function
create function data_mart.regexp_extract_all AS 'com.moxi.hive.udf.regexp.UdtfRegexpExtractAll' using jar 'hdfs:/lib/hive_udf-1.0-SNAPSHOT.jar';
create function data_center.regexp_extract_all AS 'com.moxi.hive.udf.regexp.UdtfRegexpExtractAll' using jar 'hdfs:/lib/hive_udf-1.0-SNAPSHOT.jar';
test
select voice_num from (select regexp_extract_all(ret.abc, "@#(.*?)#@", 1) as vn from (select "@#5#@ I want to made bearing @#1 of 2#@ #@#@ hit, reimbursement, ###@" as abc) ret) test lateral view explode(test.vn) r as voice_num;