AbstractTestRegexpFunctions.java

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.operator.scalar;

import com.facebook.presto.common.type.ArrayType;
import com.facebook.presto.common.type.StandardTypes;
import com.facebook.presto.spi.function.ScalarFunction;
import com.facebook.presto.spi.function.SqlType;
import com.facebook.presto.sql.analyzer.FunctionsConfig;
import com.facebook.presto.sql.analyzer.RegexLibrary;
import com.google.common.collect.ImmutableList;
import io.airlift.slice.Slice;
import io.airlift.slice.Slices;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;

import java.util.ArrayList;
import java.util.List;

import static com.facebook.presto.common.type.BooleanType.BOOLEAN;
import static com.facebook.presto.common.type.VarcharType.createUnboundedVarcharType;
import static com.facebook.presto.common.type.VarcharType.createVarcharType;
import static com.facebook.presto.spi.StandardErrorCode.INVALID_FUNCTION_ARGUMENT;

public abstract class AbstractTestRegexpFunctions
        extends AbstractTestFunctions
{
    AbstractTestRegexpFunctions(RegexLibrary regexLibrary)
    {
        super(new FunctionsConfig().setRegexLibrary(regexLibrary));
    }

    @BeforeClass
    public void setUp()
    {
        registerScalar(AbstractTestRegexpFunctions.class);
    }

    @ScalarFunction(deterministic = false) // if not non-deterministic, constant folding code accidentally fix invalid characters
    @SqlType(StandardTypes.VARCHAR)
    public static Slice invalidUtf8()
    {
        return Slices.wrappedBuffer(new byte[] {
                // AAA\uD800AAAA\uDFFFAAA, D800 and DFFF are valid unicode code points, but not valid UTF8
                (byte) 0x41, 0x41, (byte) 0xed, (byte) 0xa0, (byte) 0x80, 0x41, 0x41,
                0x41, 0x41, (byte) 0xed, (byte) 0xbf, (byte) 0xbf, 0x41, 0x41, 0x41,
        });
    }

    @Test
    public void testRegexpLike()
    {
        // Tests that REGEXP_LIKE doesn't loop infinitely on invalid UTF-8 input. Return value is irrelevant.
        functionAssertions.tryEvaluate("REGEXP_LIKE(invalid_utf8(), invalid_utf8())", BOOLEAN);

        assertFunction("REGEXP_LIKE('Stephen', 'Ste(v|ph)en')", BOOLEAN, true);
        assertFunction("REGEXP_LIKE('Stevens', 'Ste(v|ph)en')", BOOLEAN, true);
        assertFunction("REGEXP_LIKE('Stephen', '^Ste(v|ph)en$')", BOOLEAN, true);
        assertFunction("REGEXP_LIKE('Stevens', '^Ste(v|ph)en$')", BOOLEAN, false);

        assertFunction("REGEXP_LIKE('hello world', '[a-z]')", BOOLEAN, true);
        assertFunction("REGEXP_LIKE('hello\nworld', '.*hello\nworld.*')", BOOLEAN, true);
        assertFunction("REGEXP_LIKE('Hello', '^[a-z]+$')", BOOLEAN, false);
        assertFunction("REGEXP_LIKE('Hello', '^(?i)[a-z]+$')", BOOLEAN, true);
        assertFunction("REGEXP_LIKE('Hello', '^[a-zA-Z]+$')", BOOLEAN, true);

        // verify word boundaries at end of pattern (https://github.com/airlift/joni/pull/11)
        assertFunction("REGEXP_LIKE('test', 'test\\b')", BOOLEAN, true);
    }

    @Test
    public void testRegexCharLike()
    {
        assertFunction("REGEXP_LIKE('ala', CHAR 'ala  ')", BOOLEAN, false);
        assertFunction("REGEXP_LIKE('ala  ', CHAR 'ala  ')", BOOLEAN, true);
    }

    @Test
    public void testRegexpReplace()
    {
        assertFunction("REGEXP_REPLACE('fun stuff.', '[a-z]')", createVarcharType(10), " .");
        assertFunction("REGEXP_REPLACE('fun stuff.', '[a-z]', '*')", createVarcharType(65), "*** *****.");

        assertFunction(
                "REGEXP_REPLACE('call 555.123.4444 now', '(\\d{3})\\.(\\d{3}).(\\d{4})')",
                createVarcharType(21),
                "call  now");
        assertFunction(
                "REGEXP_REPLACE('call 555.123.4444 now', '(\\d{3})\\.(\\d{3}).(\\d{4})', '($1) $2-$3')",
                createVarcharType(2331),
                "call (555) 123-4444 now");

        assertFunction("REGEXP_REPLACE('xxx xxx xxx', 'x', 'x')", createVarcharType(71), "xxx xxx xxx");
        assertFunction("REGEXP_REPLACE('xxx xxx xxx', 'x', '\\x')", createVarcharType(143), "xxx xxx xxx");
        assertFunction("REGEXP_REPLACE('xxx', '', 'y')", createVarcharType(7), "yxyxyxy");
        assertInvalidFunction("REGEXP_REPLACE('xxx', 'x', '\\')", INVALID_FUNCTION_ARGUMENT);

        assertFunction("REGEXP_REPLACE('xxx xxx xxx', 'x', '$0')", createVarcharType(143), "xxx xxx xxx");
        assertFunction("REGEXP_REPLACE('xxx', '(x)', '$01')", createVarcharType(19), "xxx");
        assertFunction("REGEXP_REPLACE('xxx', 'x', '$05')", createVarcharType(19), "x5x5x5");
        assertFunction("REGEXP_REPLACE('123456789', '(1)(2)(3)(4)(5)(6)(7)(8)(9)', '$10')", createVarcharType(139), "10");
        assertFunction("REGEXP_REPLACE('1234567890', '(1)(2)(3)(4)(5)(6)(7)(8)(9)(0)', '$10')", createVarcharType(175), "0");
        assertFunction("REGEXP_REPLACE('1234567890', '(1)(2)(3)(4)(5)(6)(7)(8)(9)(0)', '$11')", createVarcharType(175), "11");
        assertFunction("REGEXP_REPLACE('1234567890', '(1)(2)(3)(4)(5)(6)(7)(8)(9)(0)', '$1a')", createVarcharType(175), "1a");
        assertInvalidFunction("REGEXP_REPLACE('xxx', 'x', '$1')", INVALID_FUNCTION_ARGUMENT);
        assertInvalidFunction("REGEXP_REPLACE('xxx', 'x', '$a')", INVALID_FUNCTION_ARGUMENT);
        assertInvalidFunction("REGEXP_REPLACE('xxx', 'x', '$')", INVALID_FUNCTION_ARGUMENT);

        assertFunction("REGEXP_REPLACE('wxyz', '(?<xyz>[xyz])', '${xyz}${xyz}')", createVarcharType(124), "wxxyyzz");
        assertFunction("REGEXP_REPLACE('wxyz', '(?<w>w)|(?<xyz>[xyz])', '[${w}](${xyz})')", createVarcharType(144), "[w]()[](x)[](y)[](z)");
        assertFunction("REGEXP_REPLACE('xyz', '(?<xyz>[xyz])+', '${xyz}')", createVarcharType(39), "z");
        assertFunction("REGEXP_REPLACE('xyz', '(?<xyz>[xyz]+)', '${xyz}')", createVarcharType(39), "xyz");
        assertInvalidFunction("REGEXP_REPLACE('xxx', '(?<name>x)', '${}')", INVALID_FUNCTION_ARGUMENT);
        assertInvalidFunction("REGEXP_REPLACE('xxx', '(?<name>x)', '${0}')", INVALID_FUNCTION_ARGUMENT);
        assertInvalidFunction("REGEXP_REPLACE('xxx', '(?<name>x)', '${nam}')", INVALID_FUNCTION_ARGUMENT);

        assertFunction("REGEXP_REPLACE(VARCHAR 'x', '.*', 'xxxxx')", createUnboundedVarcharType(), "xxxxxxxxxx");
    }

    @Test
    public void testRegexpReplaceLambda()
    {
        // One or more matches with non-empty, not null capturing groups
        assertFunction("REGEXP_REPLACE('x', '(x)', x -> upper(x[1]))", createUnboundedVarcharType(), "X");
        assertFunction("REGEXP_REPLACE('xxx xxx xxx', '(x)', x -> upper(x[1]))", createUnboundedVarcharType(), "XXX XXX XXX");
        assertFunction("REGEXP_REPLACE('new', '(\\w)', x -> x[1])", createUnboundedVarcharType(), "new");
        assertFunction("REGEXP_REPLACE('new', '(\\w)', x -> x[1] || upper(x[1]))", createUnboundedVarcharType(), "nNeEwW");
        assertFunction("REGEXP_REPLACE('new york', '(\\w)(\\w*)', x -> upper(x[1]) || lower(x[2]))", createUnboundedVarcharType(), "New York");
        assertFunction("REGEXP_REPLACE('new york', '((\\w)(\\w*))', x -> upper(x[2]) || lower(x[3]))", createUnboundedVarcharType(), "New York");
        assertFunction("REGEXP_REPLACE('new york', '(n\\w*)', x -> upper(x[1]))", createUnboundedVarcharType(), "NEW york");
        assertFunction("REGEXP_REPLACE('new york', '(y\\w*)', x -> upper(x[1]))", createUnboundedVarcharType(), "new YORK");
        assertFunction("REGEXP_REPLACE('new york city', '(yo\\w*)', x -> upper(x[1]))", createUnboundedVarcharType(), "new YORK city");
        assertFunction("REGEXP_REPLACE('abc abc', '(abc)', x -> 'm')", createUnboundedVarcharType(), "m m");
        assertFunction("REGEXP_REPLACE('123 456', '([0-9]*)', x -> x[1])", createUnboundedVarcharType(), "123 456");
        assertFunction("REGEXP_REPLACE('123 456', '(([0-9]*) ([0-9]*))', x -> x[2] || x[3])", createUnboundedVarcharType(), "123456");
        assertFunction("REGEXP_REPLACE('abbabba', '(abba)', x -> 'm')", createUnboundedVarcharType(), "mbba");
        assertFunction("REGEXP_REPLACE('abbabba', '(abba)', x -> 'm' || x[1])", createUnboundedVarcharType(), "mabbabba");
        assertFunction("REGEXP_REPLACE('abcde', 'ab(c)?de', x -> CASE WHEN x[1] IS NULL THEN 'foo' ELSE 'bar' END)", createUnboundedVarcharType(), "bar");
        assertFunction("REGEXP_REPLACE('abc', '(.)', x -> 'm')", createUnboundedVarcharType(), "mmm");

        // Matches that contains empty capturing groups
        assertFunction("REGEXP_REPLACE('abc', '.', x -> 'm')", createUnboundedVarcharType(), "mmm");  // Empty block passed to lambda
        assertFunction("REGEXP_REPLACE('abbabba', 'abba', x -> 'm')", createUnboundedVarcharType(), "mbba"); // Empty block passed to lambda
        assertFunction("REGEXP_REPLACE('abc abc', 'abc', x -> 'm')", createUnboundedVarcharType(), "m m"); // Empty block passed to lambda
        assertFunction("REGEXP_REPLACE('abc', '', x -> 'OK')", createUnboundedVarcharType(), "OKaOKbOKcOK");  // Empty block passed to lambda
        assertFunction("REGEXP_REPLACE('abc', '()', x -> x[1])", createUnboundedVarcharType(), "abc"); // Passed a block containing multiple empty capturing groups to lambda
        assertFunction("REGEXP_REPLACE('abc', '()', x -> 'OK')", createUnboundedVarcharType(), "OKaOKbOKcOK"); // Passed a block containing multiple empty capturing groups to lambda
        assertFunction("REGEXP_REPLACE('new', '(\\w*)', x -> upper(x[1]))", createUnboundedVarcharType(), "NEW");  // Two matches: ["new"] and [""]
        assertFunction("REGEXP_REPLACE('new', '(\\w*)', x -> x[1] || upper(x[1]))", createUnboundedVarcharType(), "newNEW");  // Two matches: ["new"] and [""]
        assertFunction("REGEXP_REPLACE('new', '(\\w*)', x -> CAST(length(x[1]) AS VARCHAR))", createUnboundedVarcharType(), "30");  // Two matches: ["new"] and [""]
        assertFunction("REGEXP_REPLACE('new york', '(\\w*)', x -> '<' || x[1] || '>')", createUnboundedVarcharType(), "<new><> <york><>");  // Four matches: ["new"], [""], ["york"], [""]

        // Matches that contains null capturing groups
        assertFunction("REGEXP_REPLACE('aaa', '(b)?', x -> x[1] )", createUnboundedVarcharType(), null);
        assertFunction("REGEXP_REPLACE('abde', 'ab(c)?de', x -> x[1])", createUnboundedVarcharType(), null);  // Matched once with two matching groups[(0,3), null].
        assertFunction("REGEXP_REPLACE('abde', 'ab(c)?de', x -> 'OK')", createUnboundedVarcharType(), "OK");  // Matched once with two matching groups[(0,3), null]. Passed null to lambda and returns OK, so whole string replace with OK
        assertFunction("REGEXP_REPLACE('abde', 'ab(c)?de', x -> x[1] || 'OK')", createUnboundedVarcharType(), null); // Passed null to lambda and returns null.
        assertFunction("REGEXP_REPLACE('abde', 'ab(c)?de', x -> 'OK' || x[1])", createUnboundedVarcharType(), null);  // Passed null to lambda and returns null.
        assertFunction("REGEXP_REPLACE('abde', 'ab(c)?de', x -> CASE WHEN x[1] IS NULL THEN 'foo' ELSE 'bar' END)", createUnboundedVarcharType(), "foo");
        assertFunction("REGEXP_REPLACE('ab', '(a)?(b)?', x -> CASE WHEN (x[1] IS NOT NULL) AND (x[2] IS NOT NULL) THEN 'foo' ELSE NULL END)", createUnboundedVarcharType(), null);

        // Matches that contains non-empty and not null capturing groups but lambda returns null
        assertFunction("REGEXP_REPLACE('aaa', '(a)', x -> CAST(NULL AS VARCHAR))", createUnboundedVarcharType(), null);
        assertFunction("REGEXP_REPLACE('ab', '(a)?(b)?', x -> CASE WHEN (x[1] IS NOT NULL) AND (x[2] IS NULL) OR (x[1] IS NULL) AND (x[2] IS NOT NULL) THEN 'foo' ELSE NULL END)", createUnboundedVarcharType(), null);
        assertFunction("REGEXP_REPLACE('abacdb', '(a)?(b)?', x -> CASE WHEN (x[1] IS NOT NULL) AND (x[2] IS NULL) OR (x[1] IS NULL) AND (x[2] IS NOT NULL) THEN 'foo' ELSE NULL END)", createUnboundedVarcharType(), null);

        // No matches
        assertFunction("REGEXP_REPLACE('new york', '(a)', x -> upper(x[1]))", createUnboundedVarcharType(), "new york");
        assertFunction("REGEXP_REPLACE('', '(a)', x -> upper(x[1]))", createUnboundedVarcharType(), "");
        assertFunction("REGEXP_REPLACE(null, '(a)', x -> upper(x[1]))", createUnboundedVarcharType(), null);
        assertFunction("REGEXP_REPLACE('new', null, x -> upper(x[1]))", createUnboundedVarcharType(), null);
        assertFunction("REGEXP_REPLACE('abde', '(c)', x -> x[1])", createUnboundedVarcharType(), "abde");
        assertFunction("REGEXP_REPLACE('abde', '(c)', x -> 'm')", createUnboundedVarcharType(), "abde");

        // Invalid array indexes
        assertInvalidFunction("REGEXP_REPLACE('new', '(\\w)', x -> upper(x[2]))", INVALID_FUNCTION_ARGUMENT);
        assertInvalidFunction("REGEXP_REPLACE('new', '(\\w)', x -> upper(x[0]))", INVALID_FUNCTION_ARGUMENT);
        assertInvalidFunction("REGEXP_REPLACE('abc', '', x -> x[1])", INVALID_FUNCTION_ARGUMENT);
        assertInvalidFunction("REGEXP_REPLACE('x', 'x', x -> upper(x[1]))", INVALID_FUNCTION_ARGUMENT);  // Empty block passed to lambda but referencing an element out of bound
        assertInvalidFunction("REGEXP_REPLACE('abbabba', 'abba', x -> 'm' || x[1])", INVALID_FUNCTION_ARGUMENT);  // Empty block passed to lambda but referencing an element out of bound
    }

    @Test
    public void testRegexpExtract()
    {
        assertFunction("REGEXP_EXTRACT('Hello world bye', '\\b[a-z]([a-z]*)')", createVarcharType(15), "world");
        assertFunction("REGEXP_EXTRACT('Hello world bye', '\\b[a-z]([a-z]*)', 1)", createVarcharType(15), "orld");
        assertFunction("REGEXP_EXTRACT('rat cat\nbat dog', 'ra(.)|blah(.)(.)', 2)", createVarcharType(15), null);
        assertFunction("REGEXP_EXTRACT('12345', 'x')", createVarcharType(5), null);
        assertFunction("REGEXP_EXTRACT('Baby X', 'by ([A-Z].*)\\b[a-z]')", createVarcharType(6), null);

        assertInvalidFunction("REGEXP_EXTRACT('Hello world bye', '\\b[a-z]([a-z]*)', -1)", INVALID_FUNCTION_ARGUMENT);
        assertInvalidFunction("REGEXP_EXTRACT('Hello world bye', '\\b[a-z]([a-z]*)', 2)", INVALID_FUNCTION_ARGUMENT);
    }

    @Test
    public void testRegexpExtractAll()
    {
        assertFunction("REGEXP_EXTRACT_ALL('rat cat\nbat dog', '.at')", new ArrayType(createVarcharType(15)), ImmutableList.of("rat", "cat", "bat"));
        assertFunction("REGEXP_EXTRACT_ALL('rat cat\nbat dog', '(.)at', 1)", new ArrayType(createVarcharType(15)), ImmutableList.of("r", "c", "b"));
        List<String> nullList = new ArrayList<>();
        nullList.add(null);
        assertFunction("REGEXP_EXTRACT_ALL('rat cat\nbat dog', 'ra(.)|blah(.)(.)', 2)", new ArrayType(createVarcharType(15)), nullList);
        assertInvalidFunction("REGEXP_EXTRACT_ALL('hello', '(.)', 2)", "Pattern has 1 groups. Cannot access group 2");

        assertFunction("REGEXP_EXTRACT_ALL('12345', '')", new ArrayType(createVarcharType(5)), ImmutableList.of("", "", "", "", "", ""));
        assertInvalidFunction("REGEXP_EXTRACT_ALL('12345', '(')", INVALID_FUNCTION_ARGUMENT);
    }

    @Test
    public void testRegexpSplit()
    {
        assertFunction("REGEXP_SPLIT('a.b:c;d', '[\\.:;]')", new ArrayType(createVarcharType(7)), ImmutableList.of("a", "b", "c", "d"));
        assertFunction("REGEXP_SPLIT('a.b:c;d', '\\.')", new ArrayType(createVarcharType(7)), ImmutableList.of("a", "b:c;d"));
        assertFunction("REGEXP_SPLIT('a.b:c;d', ':')", new ArrayType(createVarcharType(7)), ImmutableList.of("a.b", "c;d"));
        assertFunction("REGEXP_SPLIT('a,b,c', ',')", new ArrayType(createVarcharType(5)), ImmutableList.of("a", "b", "c"));
        assertFunction("REGEXP_SPLIT('a1b2c3d', '\\d')", new ArrayType(createVarcharType(7)), ImmutableList.of("a", "b", "c", "d"));
        assertFunction("REGEXP_SPLIT('a1b2346c3d', '\\d+')", new ArrayType(createVarcharType(10)), ImmutableList.of("a", "b", "c", "d"));
        assertFunction("REGEXP_SPLIT('abcd', 'x')", new ArrayType(createVarcharType(4)), ImmutableList.of("abcd"));
        assertFunction("REGEXP_SPLIT('abcd', '')", new ArrayType(createVarcharType(4)), ImmutableList.of("", "a", "b", "c", "d", ""));
        assertFunction("REGEXP_SPLIT('', 'x')", new ArrayType(createVarcharType(0)), ImmutableList.of(""));

        // test empty splits, leading & trailing empty splits, consecutive empty splits
        assertFunction("REGEXP_SPLIT('a,b,c,d', ',')", new ArrayType(createVarcharType(7)), ImmutableList.of("a", "b", "c", "d"));
        assertFunction("REGEXP_SPLIT(',,a,,,b,c,d,,', ',')", new ArrayType(createVarcharType(13)), ImmutableList.of("", "", "a", "", "", "b", "c", "d", "", ""));
        assertFunction("REGEXP_SPLIT(',,,', ',')", new ArrayType(createVarcharType(3)), ImmutableList.of("", "", "", ""));
    }
}