Function: return every substring that matches a regular expression, as an array

regexp_extract_all(field: string, regex: string, group: int) returns array<string>. Combined with explode / lateral view, it can be used to turn one row into multiple rows. Source code: https://github.com/leeshuaichao/hive_functions

Create a Maven project


      
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.moxi.hive</groupId>
    <artifactId>hive_udf</artifactId>
    <!-- Produces hive_udf-1.0-SNAPSHOT.jar, the file name used by the deployment commands. -->
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>HiveUDFs</name>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>3.1.2</version>
            <!-- Supplied by the Hive runtime on the cluster; only needed to compile the UDF. -->
            <scope>provided</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
Copy the code

Write the UDF class (it extends GenericUDF, despite the "Udtf" prefix in its name)

package com.moxi.hive.udf.regexp;

import com.moxi.hive.udf.utils.RegexpUtils;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.IntWritable;

Regexp_extract_all (field, re, returns the number of parentheses :0 is all) * regexp_extract_all(field, re, returns the number of parentheses :0 is all) *@author[email protected] * 2020/11/23 2:33 PM **/
public class UdtfRegexpExtractAll extends GenericUDF {
    @Override
    public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
        // Check if two arguments were passed
        if(objectInspectors.length ! =2&& objectInspectors.length ! =3) {
            throw new UDFArgumentLengthException(
                    "The function regexp_extract_all takes exactly 2 or 3 arguments.");
        }

        for (int i = 0; i < 2; i++) {
            if(! ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaStringObjectInspector, objectInspectors[i])) {throw new UDFArgumentTypeException(i,
                        "\" " + PrimitiveObjectInspectorFactory.javaStringObjectInspector.getTypeName() + "\" "
                                + "expected at function regexp_extract_all, but "
                                + "\" " + objectInspectors[i].getTypeName() + "\" "
                                + "is found"); }}if (objectInspectors.length == 3) {
            if(! ObjectInspectorUtils.compareTypes(PrimitiveObjectInspectorFactory.javaIntObjectInspector, objectInspectors[2]) {throw new UDFArgumentTypeException(2."\" " + PrimitiveObjectInspectorFactory.javaLongObjectInspector.getTypeName() + "\" "
                                + "expected at function regexp_extract_all, but "
                                + "\" " + objectInspectors[2].getTypeName() + "\" "
                                + "is found");
            }
        }

        ObjectInspector expect = PrimitiveObjectInspectorFactory.javaStringObjectInspector;

        return ObjectInspectorFactory.getStandardListObjectInspector(expect);
    }

    @Override
    public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
        String source = deferredObjects[0].get().toString();
        String pattern = deferredObjects[1].get().toString();
        Integer groupIndex = 0;
        if (deferredObjects.length == 3) {
            groupIndex = ((IntWritable) deferredObjects[2].get()).get();
        }

        if (source == null) {
            return null;
        }

        return RegexpUtils.findAll(pattern, source, groupIndex);
    }

    @Override
    public String getDisplayString(String[] strings) {
        assert (strings.length == 2 || strings.length == 3);
        if (strings.length == 2) {
            return "regexp_extract_all(" + strings[0] + ","
                    + strings[1] + ")";
        } else {
            return "regexp_extract_all(" + strings[0] + ","
                    + strings[1] + "," + strings[2] + ")"; }}}Copy the code

Regular-expression utility class

package com.moxi.hive.udf.utils;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regular-expression helpers.
 *
 * <p>Created 2020/11/23.
 */
public class RegexpUtils {
    /**
     * Finds every match of {@code regex} in {@code content} and collects the requested
     * capture group of each match.
     *
     * @param regex the regular expression to search for
     * @param content the text to search; {@code null} yields an empty list
     * @param group the capture group to take from each match, with 0 being the entire match
     * @return the group values in match order; an element is {@code null} when the group
     *         did not take part in that match
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is not a valid pattern
     * @throws IndexOutOfBoundsException if the pattern has no group with index {@code group}
     *         and at least one match is found
     */
    public static List<String> findAll(String regex, CharSequence content, int group) {
        List<String> collection = new ArrayList<>();
        Pattern pattern = Pattern.compile(regex);
        if (content != null) {
            Matcher matcher = pattern.matcher(content);
            while (matcher.find()) {
                collection.add(matcher.group(group));
            }
        }
        return collection;
    }
}

Package and upload to the server for testing

Pitfalls encountered:

It had been a long time since I last packaged a jar with Maven. A plain Maven jar build does not bundle dependencies into the jar (that requires special handling), so the Hutool utility dependency was removed.

Create temporary functions

-- Add the jar to the current session
add jar /home/hive/apache-hive-3.1.2/lib/hive_udf-1.0-SNAPSHOT.jar;
-- Create a temporary function
create temporary function regexp_extract_all AS 'com.moxi.hive.udf.regexp.UdtfRegexpExtractAll';
Copy the code

Test temporary functions

select voice_num from (select regexp_extract_all(ret.abc, "@#(.*?)#@", 1) as vn from (select "@#5#@ I want to file @#1#@ of @#2#@ reimbursement" as abc) ret) test lateral view explode(test.vn) r as voice_num;
Copy the code

Delete temporary functions

drop temporary function regexp_extract_all;
delete jar /home/hive/apache-hive-3.1.2/lib/hive_udf-1.0-SNAPSHOT.jar;
Copy the code

Generating permanent functions

Upload the JAR package to HDFS

# Create HDFS directory
hadoop fs -mkdir /lib
# Add the jar to HDFS
hadoop fs -put /home/hive/apache-hive-3.1.2/lib/hive_udf-1.0-SNAPSHOT.jar /lib/
# Check whether the jar was added successfully
hadoop fs -lsr /lib
Copy the code

Creating a permanent function

create function data_mart.regexp_extract_all AS 'com.moxi.hive.udf.regexp.UdtfRegexpExtractAll' using jar 'hdfs:///lib/hive_udf-1.0-SNAPSHOT.jar';
create function data_center.regexp_extract_all AS 'com.moxi.hive.udf.regexp.UdtfRegexpExtractAll' using jar 'hdfs:///lib/hive_udf-1.0-SNAPSHOT.jar';
Copy the code

test

select voice_num from (select regexp_extract_all(ret.abc, "@#(.*?)#@", 1) as vn from (select "@#5#@ I want to file @#1#@ of @#2#@ reimbursement" as abc) ret) test lateral view explode(test.vn) r as voice_num;
Copy the code