/src/libreoffice/i18npool/source/search/textsearch.cxx

Source
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * This file incorporates work covered by the following license notice:
 *
 *   Licensed to the Apache Software Foundation (ASF) under one or more
 *   contributor license agreements. See the NOTICE file distributed
 *   with this work for additional information regarding copyright
 *   ownership. The ASF licenses this file to you under the Apache
 *   License, Version 2.0 (the "License"); you may not use this file
 *   except in compliance with the License. You may obtain a copy of
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
 */

#include "textsearch.hxx"
#include "levdis.hxx"
#include <com/sun/star/i18n/BreakIterator.hpp>
#include <com/sun/star/util/SearchAlgorithms2.hpp>
#include <com/sun/star/util/SearchFlags.hpp>
#include <com/sun/star/i18n/WordType.hpp>
#include <com/sun/star/i18n/ScriptType.hpp>
#include <com/sun/star/i18n/CharacterIteratorMode.hpp>
#include <com/sun/star/i18n/CharacterClassification.hpp>
#include <com/sun/star/i18n/KCharacterType.hpp>
#include <com/sun/star/i18n/Transliteration.hpp>
#include <cppuhelper/supportsservice.hxx>
#include <cppuhelper/weak.hxx>
#include <i18nutil/transliteration.hxx>
#include <rtl/ustrbuf.hxx>
#include <sal/log.hxx>

#include <unicode/regex.h>

using namespace ::com::sun::star::util;
using namespace ::com::sun::star::uno;
using namespace ::com::sun::star::lang;
using namespace ::com::sun::star::i18n;
using namespace ::com::sun::star;

const TransliterationFlags COMPLEX_TRANS_MASK =
    TransliterationFlags::ignoreBaFa_ja_JP |
    TransliterationFlags::ignoreIterationMark_ja_JP |
    TransliterationFlags::ignoreTiJi_ja_JP |
    TransliterationFlags::ignoreHyuByu_ja_JP |
    TransliterationFlags::ignoreSeZe_ja_JP |
    TransliterationFlags::ignoreIandEfollowedByYa_ja_JP |
    TransliterationFlags::ignoreKiKuFollowedBySa_ja_JP |
    TransliterationFlags::ignoreProlongedSoundMark_ja_JP;

namespace
{
TransliterationFlags maskComplexTrans( TransliterationFlags n )
{
    // IGNORE_KANA and FULLWIDTH_HALFWIDTH are simple but need to take effect
    // in complex transliteration.
    return
        n & (COMPLEX_TRANS_MASK |                       // all set ignore bits
        TransliterationFlags::IGNORE_KANA |            // plus IGNORE_KANA bit
        TransliterationFlags::FULLWIDTH_HALFWIDTH);    // and the FULLWIDTH_HALFWIDTH value
}

bool isComplexTrans( TransliterationFlags n )
{
    return bool(n & COMPLEX_TRANS_MASK);
}

TransliterationFlags maskSimpleTrans( TransliterationFlags n )
{
    return n & ~COMPLEX_TRANS_MASK;
}

bool isSimpleTrans( TransliterationFlags n )
{
    return bool(maskSimpleTrans(n));
}

// Regex patterns are case sensitive.
TransliterationFlags maskSimpleRegexTrans( TransliterationFlags n )
{
    TransliterationFlags m = (n & TransliterationFlags::IGNORE_MASK) & ~TransliterationFlags::IGNORE_CASE;
    TransliterationFlags v = n & TransliterationFlags::NON_IGNORE_MASK;
    if (v == TransliterationFlags::UPPERCASE_LOWERCASE || v == TransliterationFlags::LOWERCASE_UPPERCASE)
        v = TransliterationFlags::NONE;
    return (m | v) & ~COMPLEX_TRANS_MASK;
}

bool isSimpleRegexTrans( TransliterationFlags n )
{
    return bool(maskSimpleRegexTrans(n));
}

bool isReplacePunctuation( std::u16string_view rStr )
{
    return rStr.find(u'\u2018') != std::u16string_view::npos ||
           rStr.find(u'\u2019') != std::u16string_view::npos ||
           rStr.find(u'\u201A') != std::u16string_view::npos ||
           rStr.find(u'\u201B') != std::u16string_view::npos ||
           rStr.find(u'\u201C') != std::u16string_view::npos ||
           rStr.find(u'\u201D') != std::u16string_view::npos ||
           rStr.find(u'\u201E') != std::u16string_view::npos ||
           rStr.find(u'\u201F') != std::u16string_view::npos;
}

OUString replacePunctuation( const OUString &rStr )
{
    return rStr.replace(u'\u2018', '\'')
               .replace(u'\u2019', '\'')
               .replace(u'\u201A', '\'')
               .replace(u'\u201B', '\'')
               .replace(u'\u201C', '"')
               .replace(u'\u201D', '"')
               .replace(u'\u201E', '"')
               .replace(u'\u201F', '"');
}
};

TextSearch::TextSearch(const Reference < XComponentContext > & rxContext)
        : m_xContext( rxContext )
{
    SearchOptions2 aOpt;
    aOpt.AlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
    aOpt.algorithmType = SearchAlgorithms_ABSOLUTE;
    aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE;
    //aOpt.Locale = ???;
    setOptions( aOpt );
}

TextSearch::~TextSearch()
{
    pRegexMatcher.reset();
    pWLD.reset();
    pJumpTable.reset();
    pJumpTable2.reset();
}

void TextSearch::setOptions2( const SearchOptions2& rOptions )
{
    std::unique_lock g(m_aMutex);

    aSrchPara = rOptions;

    pRegexMatcher.reset();
    pWLD.reset();
    pJumpTable.reset();
    pJumpTable2.reset();
    maWildcardReversePattern.clear();
    maWildcardReversePattern2.clear();
    TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(aSrchPara.transliterateFlags);
    bSearchApostrophe = false;
    bool bReplaceApostrophe = false;
    if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP)
    {
        // RESrchPrepare will consider aSrchPara.transliterateFlags when
        // picking the actual regex pattern
        // (sSrchStr|sSrchStr2|SearchOptions2::searchString) and setting
        // case-insensitivity. Create transliteration instance, if any, without
        // ignore-case so later in TextSearch::searchForward() the string to
        // match is not case-altered, leave case-(in)sensitive to regex engine.
        transliterateFlags &= ~TransliterationFlags::IGNORE_CASE;
    }
    else if ( aSrchPara.searchString.indexOf('\'') > - 1 || aSrchPara.searchString.indexOf('"') > - 1 )
    {
        bSearchApostrophe = true;
        bReplaceApostrophe = isReplacePunctuation(aSrchPara.searchString);
    }

    // Create Transliteration class
    if( isSimpleTrans( transliterateFlags) )
    {
        if( !xTranslit.is() )
            xTranslit.set( Transliteration::create( m_xContext ) );
        xTranslit->loadModule(
             static_cast<TransliterationModules>(maskSimpleTrans(transliterateFlags)),
             aSrchPara.Locale);
    }
    else if( xTranslit.is() )
        xTranslit = nullptr;

    // Create Transliteration for 2<->1, 2<->2 transliteration
    if ( isComplexTrans( transliterateFlags) )
    {
        if( !xTranslit2.is() )
            xTranslit2.set( Transliteration::create( m_xContext ) );
        // Load transliteration module
        xTranslit2->loadModule(
             static_cast<TransliterationModules>(maskComplexTrans(transliterateFlags)),
             aSrchPara.Locale);
    }

    if ( !xBreak.is() )
        xBreak = css::i18n::BreakIterator::create( m_xContext );

    sSrchStr = aSrchPara.searchString;

    // Transliterate search string.
    if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP)
    {
        if (isSimpleRegexTrans(transliterateFlags))
        {
            if (maskSimpleRegexTrans(transliterateFlags) !=
                    maskSimpleTrans(transliterateFlags))
            {
                css::uno::Reference< XExtendedTransliteration > xTranslitPattern(
                         Transliteration::create( m_xContext ));
                if (xTranslitPattern.is())
                {
                    xTranslitPattern->loadModule(
                            static_cast<TransliterationModules>(maskSimpleRegexTrans(transliterateFlags)),
                            aSrchPara.Locale);
                    sSrchStr = xTranslitPattern->transliterateString2String(
                            aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
                }
            }
            else
            {
                if (xTranslit.is())
                    sSrchStr = xTranslit->transliterateString2String(
                            aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
            }
            // xTranslit2 complex transliterated sSrchStr2 is not used in
            // regex, see TextSearch::searchForward() and
            // TextSearch::searchBackward()
        }
    }
    else
    {
        if ( xTranslit.is() && isSimpleTrans(transliterateFlags) )
            sSrchStr = xTranslit->transliterateString2String(
                    aSrchPara.searchString, 0, aSrchPara.searchString.getLength());

        if ( xTranslit2.is() && isComplexTrans(transliterateFlags) )
            sSrchStr2 = xTranslit2->transliterateString2String(
                    aSrchPara.searchString, 0, aSrchPara.searchString.getLength());
    }

    if ( bReplaceApostrophe )
        sSrchStr = replacePunctuation(sSrchStr);

    // Take the new SearchOptions2::AlgorithmType2 field and ignore
    // SearchOptions::algorithmType
    switch( aSrchPara.AlgorithmType2)
    {
        case SearchAlgorithms2::REGEXP:
            fnForward = &TextSearch::RESrchFrwrd;
            fnBackward = &TextSearch::RESrchBkwrd;
            RESrchPrepare( aSrchPara);
            break;

        case SearchAlgorithms2::APPROXIMATE:
            fnForward = &TextSearch::ApproxSrchFrwrd;
            fnBackward = &TextSearch::ApproxSrchBkwrd;

            pWLD.reset( new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars,
                    aSrchPara.insertedChars, aSrchPara.deletedChars,
                    0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) ) );

            nLimit = pWLD->GetLimit();
            break;

        case SearchAlgorithms2::WILDCARD:
            mcWildcardEscapeChar = static_cast<sal_uInt32>(aSrchPara.WildcardEscapeCharacter);
            mbWildcardAllowSubstring = ((aSrchPara.searchFlag & SearchFlags::WILD_MATCH_SELECTION) == 0);
            fnForward = &TextSearch::WildcardSrchFrwrd;
            fnBackward = &TextSearch::WildcardSrchBkwrd;
            break;

        default:
            SAL_WARN("i18npool","TextSearch::setOptions2 - default what?");
            [[fallthrough]];
        case SearchAlgorithms2::ABSOLUTE:
            fnForward = &TextSearch::NSrchFrwrd;
            fnBackward = &TextSearch::NSrchBkwrd;
            break;
    }
}

void TextSearch::setOptions( const SearchOptions& rOptions )
{
    sal_Int16 nAlgorithmType2;
    switch (rOptions.algorithmType)
    {
        case SearchAlgorithms_REGEXP:
            nAlgorithmType2 = SearchAlgorithms2::REGEXP;
            break;
        case SearchAlgorithms_APPROXIMATE:
            nAlgorithmType2 = SearchAlgorithms2::APPROXIMATE;
            break;
        default:
            SAL_WARN("i18npool","TextSearch::setOptions - default what?");
            [[fallthrough]];
        case SearchAlgorithms_ABSOLUTE:
            nAlgorithmType2 = SearchAlgorithms2::ABSOLUTE;
            break;
    }
    // It would be nice if an inherited struct had a ctor that takes an
    // instance of the object the struct derived from...
    SearchOptions2 aOptions2(
            rOptions.algorithmType,
            rOptions.searchFlag,
            rOptions.searchString,
            rOptions.replaceString,
            rOptions.Locale,
            rOptions.changedChars,
            rOptions.deletedChars,
            rOptions.insertedChars,
            rOptions.transliterateFlags,
            nAlgorithmType2,
            0   // no wildcard search, no escape character...
            );
    setOptions2( aOptions2);
}

static sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos )
{
    auto pOff = std::find_if(rOff.begin(), rOff.end(),
        [nPos](const sal_Int32 nOff) { return nOff >= nPos; });
    return static_cast<sal_Int32>(std::distance(rOff.begin(), pOff));
}

SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
{
    std::unique_lock g(m_aMutex);

    SearchResult sres;

    OUString in_str(searchStr);

    // in non-regex mode, allow searching typographical apostrophe with the ASCII one
    // to avoid regression after using automatic conversion to U+2019 during typing in Writer
    bool bReplaceApostrophe = bSearchApostrophe && isReplacePunctuation(in_str);

    bUsePrimarySrchStr = true;

    if ( xTranslit.is() )
    {
        // apply normal transliteration (1<->1, 1<->0)

        sal_Int32 nInStartPos = startPos;
        if (pRegexMatcher && startPos > 0)
        {
            // tdf#89665, tdf#75806: An optimization to avoid transliterating the whole string, yet
            // transliterate enough of the leading text to allow sensible look-behind assertions.
            // 100 is chosen arbitrarily in the hope that look-behind assertions would largely fit.
            // See http://userguide.icu-project.org/strings/regexp for look-behind assertion syntax.
            // When search regex doesn't start with an assertion, 3 is to allow startPos to be in
            // the middle of a surrogate pair, preceded by another surrogate pair.
            const sal_Int32 nMaxLeadingLen = aSrchPara.searchString.startsWith("(?") ? 100 : 3;
            nInStartPos -= std::min(nMaxLeadingLen, startPos);
        }
        sal_Int32 nInEndPos = endPos;
        if (pRegexMatcher && endPos < searchStr.getLength())
        {
            // tdf#65038: ditto for look-ahead assertions
            const sal_Int32 nMaxTrailingLen = aSrchPara.searchString.endsWith(")") ? 100 : 3;
            nInEndPos += std::min(nMaxTrailingLen, searchStr.getLength() - endPos);
        }

        css::uno::Sequence<sal_Int32> offset(nInEndPos - nInStartPos);
        in_str = xTranslit->transliterate(searchStr, nInStartPos, nInEndPos - nInStartPos, offset);

        if ( bReplaceApostrophe )
            in_str = replacePunctuation(in_str);

        // JP 20.6.2001: also the start and end positions must be corrected!
        sal_Int32 newStartPos =
            (startPos == 0) ? 0 : FindPosInSeq_Impl( offset, startPos );

        sal_Int32 newEndPos = (endPos < searchStr.getLength())
            ? FindPosInSeq_Impl( offset, endPos )
            : in_str.getLength();

        sres = (this->*fnForward)( g, in_str, newStartPos, newEndPos );

        // Map offsets back to untransliterated string.
        const sal_Int32 nOffsets = offset.getLength();
        if (nOffsets)
        {
            auto sres_startOffsetRange = asNonConstRange(sres.startOffset);
            auto sres_endOffsetRange = asNonConstRange(sres.endOffset);
            // For regex nGroups is the number of groups+1 with group 0 being
            // the entire match.
            const sal_Int32 nGroups = sres.startOffset.getLength();
            for ( sal_Int32 k = 0; k < nGroups; k++ )
            {
                const sal_Int32 nStart = sres.startOffset[k];
                // Result offsets are negative (-1) if a group expression was
                // not matched.
                if (nStart >= 0)
                    sres_startOffsetRange[k] = (nStart < nOffsets ? offset[nStart] : (offset[nOffsets - 1] + 1));
                // JP 20.6.2001: end is ever exclusive and then don't return
                //               the position of the next character - return the
                //               next position behind the last found character!
                //               "a b c" find "b" must return 2,3 and not 2,4!!!
                const sal_Int32 nStop = sres.endOffset[k];
                if (nStop >= 0)
                {
                    if (nStop > 0)
                        sres_endOffsetRange[k] = offset[(nStop <= nOffsets ? nStop : nOffsets) - 1] + 1;
                    else
                        sres_endOffsetRange[k] = offset[0];
                }
            }
        }
    }
    else
    {
        if ( bReplaceApostrophe )
            in_str = in_str.replace(u'\u2019', '\'');

        sres = (this->*fnForward)( g, in_str, startPos, endPos );
    }

    if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP)
    {
        SearchResult sres2;

        in_str = searchStr;
        css::uno::Sequence <sal_Int32> offset( in_str.getLength());

        in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset );

        if( startPos )
            startPos = FindPosInSeq_Impl( offset, startPos );

        if( endPos < searchStr.getLength() )
            endPos = FindPosInSeq_Impl( offset, endPos );
        else
            endPos = in_str.getLength();

        bUsePrimarySrchStr = false;
        sres2 = (this->*fnForward)( g, in_str, startPos, endPos );
        auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset);
        auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset);

        for ( int k = 0; k < sres2.startOffset.getLength(); k++ )
        {
            if (sres2.startOffset[k])
                sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1] + 1;
            if (sres2.endOffset[k])
                sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1] + 1;
        }

        // pick first and long one
        if ( sres.subRegExpressions == 0)
            return sres2;
        if ( sres2.subRegExpressions == 1)
        {
            if ( sres.startOffset[0] > sres2.startOffset[0])
                return sres2;
            else if ( sres.startOffset[0] == sres2.startOffset[0] &&
                    sres.endOffset[0] < sres2.endOffset[0])
                return sres2;
        }
    }

    return sres;
}

SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
{
    std::unique_lock g(m_aMutex);

    SearchResult sres;

    OUString in_str(searchStr);

    // in non-regex mode, allow searching typographical apostrophe with the ASCII one
    // to avoid regression after using automatic conversion to U+2019 during typing in Writer
    bool bReplaceApostrophe = bSearchApostrophe && isReplacePunctuation(in_str);

    bUsePrimarySrchStr = true;

    if ( xTranslit.is() )
    {
        // apply only simple 1<->1 transliteration here
        css::uno::Sequence<sal_Int32> offset(startPos - endPos);
        in_str = xTranslit->transliterate( searchStr, endPos, startPos - endPos, offset );

        if ( bReplaceApostrophe )
            in_str = replacePunctuation(in_str);

        // JP 20.6.2001: also the start and end positions must be corrected!
        sal_Int32 const newStartPos = (startPos < searchStr.getLength())
            ? FindPosInSeq_Impl( offset, startPos )
            : in_str.getLength();

        sal_Int32 const newEndPos =
            (endPos == 0) ? 0 : FindPosInSeq_Impl( offset, endPos );

        // TODO: this would need nExtraOffset handling to avoid $ matching
        // if (pRegexMatcher && startPos < searchStr.getLength())
        // but that appears to be impossible with ICU regex

        sres = (this->*fnBackward)( g, in_str, newStartPos, newEndPos );

        // Map offsets back to untransliterated string.
        const sal_Int32 nOffsets = offset.getLength();
        if (nOffsets)
        {
            auto sres_startOffsetRange = asNonConstRange(sres.startOffset);
            auto sres_endOffsetRange = asNonConstRange(sres.endOffset);
            // For regex nGroups is the number of groups+1 with group 0 being
            // the entire match.
            const sal_Int32 nGroups = sres.startOffset.getLength();
            for ( sal_Int32 k = 0; k < nGroups; k++ )
            {
                const sal_Int32 nStart = sres.startOffset[k];
                // Result offsets are negative (-1) if a group expression was
                // not matched.
                if (nStart >= 0)
                {
                    if (nStart > 0)
                        sres_startOffsetRange[k] = offset[(nStart <= nOffsets ? nStart : nOffsets) - 1] + 1;
                    else
                        sres_startOffsetRange[k] = offset[0];
                }
                // JP 20.6.2001: end is ever exclusive and then don't return
                //               the position of the next character - return the
                //               next position behind the last found character!
                //               "a b c" find "b" must return 2,3 and not 2,4!!!
                const sal_Int32 nStop = sres.endOffset[k];
                if (nStop >= 0)
                    sres_endOffsetRange[k] = (nStop < nOffsets ? offset[nStop] : (offset[nOffsets - 1] + 1));
            }
        }
    }
    else
    {
        if ( bReplaceApostrophe )
            in_str = replacePunctuation(in_str);

        sres = (this->*fnBackward)( g, in_str, startPos, endPos );
    }

    if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP )
    {
        SearchResult sres2;

        in_str = searchStr;
        css::uno::Sequence <sal_Int32> offset( in_str.getLength());

        in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset);

        if( startPos < searchStr.getLength() )
            startPos = FindPosInSeq_Impl( offset, startPos );
        else
            startPos = in_str.getLength();

        if( endPos )
            endPos = FindPosInSeq_Impl( offset, endPos );

        bUsePrimarySrchStr = false;
        sres2 = (this->*fnBackward)( g, in_str, startPos, endPos );
        auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset);
        auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset);

        for( int k = 0; k < sres2.startOffset.getLength(); k++ )
        {
            if (sres2.startOffset[k])
                sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1]+1;
            if (sres2.endOffset[k])
                sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1]+1;
        }

        // pick last and long one
        if ( sres.subRegExpressions == 0 )
            return sres2;
        if ( sres2.subRegExpressions == 1 )
        {
            if ( sres.startOffset[0] < sres2.startOffset[0] )
                return sres2;
            if ( sres.startOffset[0] == sres2.startOffset[0] &&
                    sres.endOffset[0] > sres2.endOffset[0] )
                return sres2;
        }
    }

    return sres;
}


bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const
{
    bool bRet = true;
    if( '\x7f' != rStr[nPos])
    {
        if ( !xCharClass.is() )
             xCharClass = CharacterClassification::create( m_xContext );
        sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos,
                aSrchPara.Locale );
        if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA |
                        KCharacterType::LETTER ) & nCType ) )
            bRet = false;
    }
    return bRet;
}

// --------- helper methods for Boyer-Moore like text searching ----------
// TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available

void TextSearch::MakeForwardTab()
{
    // create the jumptable for the search text

    if( pJumpTable && bIsForwardTab )
    {
        return; // the jumpTable is ok
    }
    bIsForwardTab = true;

    sal_Int32 n, nLen = sSrchStr.getLength();
    pJumpTable.reset( new TextSearchJumpTable );

    for( n = 0; n < nLen - 1; ++n )
    {
        sal_Unicode cCh = sSrchStr[n];
        sal_Int32 nDiff = nLen - n - 1;
        TextSearchJumpTable::value_type aEntry( cCh, nDiff );

        ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
            pJumpTable->insert( aEntry );
        if ( !aPair.second )
            (*(aPair.first)).second = nDiff;
    }
}

void TextSearch::MakeForwardTab2()
{
    // create the jumptable for the search text
    if( pJumpTable2 && bIsForwardTab )
    {
        return;        // the jumpTable is ok
    }
    bIsForwardTab = true;

    sal_Int32 n, nLen = sSrchStr2.getLength();
    pJumpTable2.reset( new TextSearchJumpTable );

    for( n = 0; n < nLen - 1; ++n )
    {
        sal_Unicode cCh = sSrchStr2[n];
        sal_Int32 nDiff = nLen - n - 1;

        TextSearchJumpTable::value_type aEntry( cCh, nDiff );
        ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
            pJumpTable2->insert( aEntry );
        if ( !aPair.second )
            (*(aPair.first)).second = nDiff;
    }
}

void TextSearch::MakeBackwardTab()
{
    // create the jumptable for the search text
    if( pJumpTable && !bIsForwardTab)
    {
        return;   // the jumpTable is ok
    }
    bIsForwardTab = false;

    sal_Int32 n, nLen = sSrchStr.getLength();
    pJumpTable.reset( new TextSearchJumpTable );

    for( n = nLen-1; n > 0; --n )
    {
        sal_Unicode cCh = sSrchStr[n];
        TextSearchJumpTable::value_type aEntry( cCh, n );
        ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
            pJumpTable->insert( aEntry );
        if ( !aPair.second )
            (*(aPair.first)).second = n;
    }
}

void TextSearch::MakeBackwardTab2()
{
    // create the jumptable for the search text
    if( pJumpTable2 && !bIsForwardTab )
    {
        return;    // the jumpTable is ok
    }
    bIsForwardTab = false;

    sal_Int32 n, nLen = sSrchStr2.getLength();
    pJumpTable2.reset( new TextSearchJumpTable );

    for( n = nLen-1; n > 0; --n )
    {
        sal_Unicode cCh = sSrchStr2[n];
        TextSearchJumpTable::value_type aEntry( cCh, n );
        ::std::pair< TextSearchJumpTable::iterator, bool > aPair =
            pJumpTable2->insert( aEntry );
        if ( !aPair.second )
            (*(aPair.first)).second = n;
    }
}

sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const
{
    TextSearchJumpTable *pJump;
    OUString sSearchKey;

    if ( bUsePrimarySrchStr ) {
        pJump = pJumpTable.get();
        sSearchKey = sSrchStr;
    } else {
        pJump = pJumpTable2.get();
        sSearchKey = sSrchStr2;
    }

    TextSearchJumpTable::const_iterator iLook = pJump->find( cChr );
    if ( iLook == pJump->end() )
        return sSearchKey.getLength();
    return (*iLook).second;
}


SearchResult TextSearch::NSrchFrwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
{
    SearchResult aRet;
    aRet.subRegExpressions = 0;

    OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;

    sal_Int32 nSuchIdx = searchStr.getLength();
    sal_Int32 nEnd = endPos;
    if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx )
        return aRet;


    if( nEnd < sSearchKey.getLength() )   // position inside the search region ?
        return aRet;

    nEnd -= sSearchKey.getLength();

    if (bUsePrimarySrchStr)
      MakeForwardTab();                   // create the jumptable
    else
      MakeForwardTab2();

    for (sal_Int32 nCmpIdx = startPos; // start position for the search
            nCmpIdx <= nEnd;
            nCmpIdx += GetDiff( searchStr[nCmpIdx + sSearchKey.getLength()-1]))
    {
        nSuchIdx = sSearchKey.getLength() - 1;
        while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == searchStr[nCmpIdx + nSuchIdx])
        {
            if( nSuchIdx == 0 )
            {
                if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
                {
                    sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength();
                    bool bAtStart = !nCmpIdx;
                    bool bAtEnd = nFndEnd == endPos;
                    bool bDelimBefore = bAtStart || IsDelimiter( searchStr, nCmpIdx-1 );
                    bool bDelimBehind = bAtEnd || IsDelimiter(  searchStr, nFndEnd );
                    //  *       1 -> only one word in the paragraph
                    //  *       2 -> at begin of paragraph
                    //  *       3 -> at end of paragraph
                    //  *       4 -> inside the paragraph
                    if( !(  ( bAtStart && bAtEnd ) ||           // 1
                                ( bAtStart && bDelimBehind ) ||     // 2
                                ( bAtEnd && bDelimBefore ) ||       // 3
                                ( bDelimBefore && bDelimBehind )))  // 4
                        break;
                }

                aRet.subRegExpressions = 1;
                aRet.startOffset = { nCmpIdx };
                aRet.endOffset = { nCmpIdx + sSearchKey.getLength() };

                return aRet;
            }
            else
                nSuchIdx--;
        }
    }
    return aRet;
}

SearchResult TextSearch::NSrchBkwrd( std::unique_lock<std::mutex>& /*rGuard*/,const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos )
{
    SearchResult aRet;
    aRet.subRegExpressions = 0;

    OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2;

    sal_Int32 nSuchIdx = searchStr.getLength();
    sal_Int32 nEnd = endPos;
    if( nSuchIdx == 0 || sSearchKey.isEmpty() || sSearchKey.getLength() > nSuchIdx)
        return aRet;

    if (bUsePrimarySrchStr)
        MakeBackwardTab();                  // create the jumptable
    else
        MakeBackwardTab2();

    if( nEnd == nSuchIdx )                  // end position for the search
        nEnd = sSearchKey.getLength();
    else
        nEnd += sSearchKey.getLength();

    sal_Int32 nCmpIdx = startPos;          // start position for the search

    while (nCmpIdx >= nEnd)
    {
        nSuchIdx = 0;
        while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] ==
                searchStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] )
            nSuchIdx++;
        if( nSuchIdx >= sSearchKey.getLength() )
        {
            if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag )
            {
                sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength();
                bool bAtStart = !nFndStt;
                bool bAtEnd = nCmpIdx == startPos;
                bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nCmpIdx );
                bool bDelimBefore = bAtStart || // begin of paragraph
                    IsDelimiter( searchStr, nFndStt-1 );
                //  *       1 -> only one word in the paragraph
                //  *       2 -> at begin of paragraph
                //  *       3 -> at end of paragraph
                //  *       4 -> inside the paragraph
                if( ( bAtStart && bAtEnd ) ||           // 1
                        ( bAtStart && bDelimBehind ) ||     // 2
                        ( bAtEnd && bDelimBefore ) ||       // 3
                        ( bDelimBefore && bDelimBehind ))   // 4
                {
                    aRet.subRegExpressions = 1;
                    aRet.startOffset = { nCmpIdx };
                    aRet.endOffset = { nCmpIdx - sSearchKey.getLength() };
                    return aRet;
                }
            }
            else
            {
                aRet.subRegExpressions = 1;
                aRet.startOffset = { nCmpIdx };
                aRet.endOffset = { nCmpIdx - sSearchKey.getLength() };
                return aRet;
            }
        }
        nSuchIdx = GetDiff( searchStr[nCmpIdx - sSearchKey.getLength()] );
        if( nCmpIdx < nSuchIdx )
            return aRet;
        nCmpIdx -= nSuchIdx;
    }
    return aRet;
}

void TextSearch::RESrchPrepare( const css::util::SearchOptions2& rOptions)
{
    TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(rOptions.transliterateFlags);
    // select the transliterated pattern string
    const OUString& rPatternStr =
        (isSimpleTrans(transliterateFlags) ? sSrchStr
        : (isComplexTrans(transliterateFlags) ? sSrchStr2 : rOptions.searchString));

    sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability
    // map css::util::SearchFlags to ICU uregex.h flags
    // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
    // REG_NEWLINE is neither properly defined nor used anywhere => not implemented
    // REG_NOSUB is not used anywhere => not implemented
    // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
    // LEV_RELAXED is only used for SearchAlgorithm==Approximate
    // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO
    // probably because the transliteration flag IGNORE_CASE handles it as well.
    if( (rOptions.searchFlag & css::util::SearchFlags::ALL_IGNORE_CASE) != 0
    ||  (transliterateFlags & TransliterationFlags::IGNORE_CASE))
        nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
    UErrorCode nIcuErr = U_ZERO_ERROR;
    // assumption: transliteration didn't mangle regexp control chars
    icu::UnicodeString aIcuSearchPatStr( reinterpret_cast<const UChar*>(rPatternStr.getStr()), rPatternStr.getLength());
#ifndef DISABLE_WORDBOUND_EMULATION
    // for convenience specific syntax elements of the old regex engine are emulated
    // - by replacing \< with "word-break followed by a look-ahead word-char"
    static const icu::UnicodeString aChevronPatternB( "\\\\<", -1, icu::UnicodeString::kInvariant);
    static const icu::UnicodeString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, icu::UnicodeString::kInvariant);
    static icu::RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr);
    aChevronMatcherB.reset( aIcuSearchPatStr);
    aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr);
    aChevronMatcherB.reset();
    // - by replacing \> with "look-behind word-char followed by a word-break"
    static const icu::UnicodeString aChevronPatternE( "\\\\>", -1, icu::UnicodeString::kInvariant);
    static const icu::UnicodeString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, icu::UnicodeString::kInvariant);
    static icu::RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr);
    aChevronMatcherE.reset( aIcuSearchPatStr);
    aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr);
    aChevronMatcherE.reset();
#endif
    pRegexMatcher.reset( new icu::RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr) );
    if (nIcuErr)
    {
        SAL_INFO( "i18npool", "TextSearch::RESrchPrepare UErrorCode " << nIcuErr);
        pRegexMatcher.reset();
    }
    else
    {
        // Pathological patterns may result in exponential run time making the
        // application appear to be frozen. Limit that. Documentation for this
        // call says
        // https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1RegexMatcher.html#a6ebcfcab4fe6a38678c0291643a03a00
        // "The units of the limit are steps of the match engine.
        // Correspondence with actual processor time will depend on the speed
        // of the processor and the details of the specific pattern, but will
        // typically be on the order of milliseconds."
        // Just what is a good value? 42 is always an answer ... the 23 enigma
        // as well... which on the dev's machine is roughly 50 seconds with the
        // pattern of fdo#70627.
        /* TODO: make this a configuration settable value and possibly take
         * complexity of expression into account and maybe even length of text
         * to be matched; currently (2013-11-25) that is at most one 64k
         * paragraph per RESrchFrwrd()/RESrchBkwrd() call. */
        pRegexMatcher->setTimeLimit( 23*1000, nIcuErr);
    }
}


static bool lcl_findRegex(std::unique_ptr<icu::RegexMatcher> const& pRegexMatcher,
                          sal_Int32 nStartPos, sal_Int32 nEndPos, UErrorCode& rIcuErr)
{
    pRegexMatcher->region(nStartPos, nEndPos, rIcuErr);
    pRegexMatcher->useAnchoringBounds(false); // use whole text's anchoring bounds, not region's
    pRegexMatcher->useTransparentBounds(true); // take text outside of the region into account for
                                               // look-ahead/behind assertions

    if (!pRegexMatcher->find(rIcuErr))
    {
        /* TODO: future versions could pass the UErrorCode or translations
         * thereof to the caller, for example to inform the user of
         * U_REGEX_TIME_OUT. The strange thing though is that an error is set
         * only after the second call that returns immediately and not if
         * timeout occurred on the first call?!? */
        SAL_INFO( "i18npool", "lcl_findRegex UErrorCode " << rIcuErr);
        return false;
    }
    return true;
}

SearchResult TextSearch::RESrchFrwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr,
                                      sal_Int32 startPos, sal_Int32 endPos )
{
    SearchResult aRet;
    aRet.subRegExpressions = 0;
    if( !pRegexMatcher)
        return aRet;

    if( endPos > searchStr.getLength())
        endPos = searchStr.getLength();

    // use the ICU RegexMatcher to find the matches
    UErrorCode nIcuErr = U_ZERO_ERROR;
    const icu::UnicodeString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
                                        searchStr.getLength());
    pRegexMatcher->reset( aSearchTargetStr);
    // search until there is a valid match
    for(;;)
    {
        if (!lcl_findRegex( pRegexMatcher, startPos, endPos, nIcuErr))
            return aRet;

        // #i118887# ignore zero-length matches e.g. "a*" in "bc"
        int nStartOfs = pRegexMatcher->start( nIcuErr);
        int nEndOfs = pRegexMatcher->end( nIcuErr);
        if( nStartOfs < nEndOfs)
            break;
        // If the zero-length match is behind the string, do not match it again
        // and again until startPos reaches there. A match behind the string is
        // a "$" anchor.
        if (nStartOfs == endPos)
            break;
        // try at next position if there was a zero-length match
        if( ++startPos >= endPos)
            return aRet;
    }

    // extract the result of the search
    const int nGroupCount = pRegexMatcher->groupCount();
    aRet.subRegExpressions = nGroupCount + 1;
    aRet.startOffset.realloc( aRet.subRegExpressions);
    auto pstartOffset = aRet.startOffset.getArray();
    aRet.endOffset.realloc( aRet.subRegExpressions);
    auto pendOffset = aRet.endOffset.getArray();
    pstartOffset[0] = pRegexMatcher->start( nIcuErr);
    pendOffset[0]   = pRegexMatcher->end( nIcuErr);
    for( int i = 1; i <= nGroupCount; ++i) {
        pstartOffset[i] = pRegexMatcher->start( i, nIcuErr);
        pendOffset[i]   = pRegexMatcher->end( i, nIcuErr);
    }

    return aRet;
}

SearchResult TextSearch::RESrchBkwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr,
                                      sal_Int32 startPos, sal_Int32 endPos )
{
    // NOTE: for backwards search callers provide startPos/endPos inverted!
    SearchResult aRet;
    aRet.subRegExpressions = 0;
    if( !pRegexMatcher)
        return aRet;

    if( startPos > searchStr.getLength())
        startPos = searchStr.getLength();

    // use the ICU RegexMatcher to find the matches
    // TODO: use ICU's backward searching once it becomes available
    //       as its replacement using forward search is not as good as the real thing
    UErrorCode nIcuErr = U_ZERO_ERROR;
    const icu::UnicodeString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()),
                                        searchStr.getLength());
    pRegexMatcher->reset( aSearchTargetStr);
    if (!lcl_findRegex( pRegexMatcher, endPos, startPos, nIcuErr))
        return aRet;

    // find the last match
    int nLastPos = 0;
    int nFoundEnd = 0;
    int nGoodPos = 0, nGoodEnd = 0;
    bool bFirst = true;
    do {
        nLastPos = pRegexMatcher->start( nIcuErr);
        nFoundEnd = pRegexMatcher->end( nIcuErr);
        if (nLastPos < nFoundEnd)
        {
            // remember last non-zero-length match
            nGoodPos = nLastPos;
            nGoodEnd = nFoundEnd;
        }
        if( nFoundEnd >= startPos)
            break;
        bFirst = false;
        if( nFoundEnd == nLastPos)
            ++nFoundEnd;
    } while( lcl_findRegex( pRegexMatcher, nFoundEnd, startPos, nIcuErr));

    // Ignore all zero-length matches except "$" anchor on first match.
    if (nGoodPos == nGoodEnd)
    {
        if (bFirst && nLastPos == startPos)
            nGoodPos = nLastPos;
        else
            return aRet;
    }

    // find last match again to get its details
    lcl_findRegex( pRegexMatcher, nGoodPos, startPos, nIcuErr);

    // fill in the details of the last match
    const int nGroupCount = pRegexMatcher->groupCount();
    aRet.subRegExpressions = nGroupCount + 1;
    aRet.startOffset.realloc( aRet.subRegExpressions);
    auto pstartOffset = aRet.startOffset.getArray();
    aRet.endOffset.realloc( aRet.subRegExpressions);
    auto pendOffset = aRet.endOffset.getArray();
    // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted!
    pstartOffset[0] = pRegexMatcher->end( nIcuErr);
    pendOffset[0]   = pRegexMatcher->start( nIcuErr);
    for( int i = 1; i <= nGroupCount; ++i) {
        pstartOffset[i] = pRegexMatcher->end( i, nIcuErr);
        pendOffset[i]   = pRegexMatcher->start( i, nIcuErr);
    }

    return aRet;
}


// search for words phonetically
SearchResult TextSearch::ApproxSrchFrwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr,
                                          sal_Int32 startPos, sal_Int32 endPos )
{
    SearchResult aRet;
    aRet.subRegExpressions = 0;

    if( !xBreak.is() )
        return aRet;

    sal_Int32 nStt, nEnd;

    Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos,
            aSrchPara.Locale,
            WordType::ANYWORD_IGNOREWHITESPACES, true );

    do
    {
        if( aWBnd.startPos >= endPos )
            break;
        nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos;
        nEnd = std::min(aWBnd.endPos, endPos);

        if( nStt < nEnd &&
                pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit )
        {
            aRet.subRegExpressions = 1;
            aRet.startOffset = { nStt };
            aRet.endOffset = { nEnd };
            break;
        }

        nStt = nEnd - 1;
        aWBnd = xBreak->nextWord( searchStr, nStt, aSrchPara.Locale,
                WordType::ANYWORD_IGNOREWHITESPACES);
    } while( aWBnd.startPos != aWBnd.endPos ||
            (aWBnd.endPos != searchStr.getLength() && aWBnd.endPos != nEnd) );
    // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only
    // whitespace) in searchStr, getWordBoundary() returned startPos,startPos
    // and nextWord() does also => don't loop forever.
    return aRet;
}

SearchResult TextSearch::ApproxSrchBkwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr,
                                          sal_Int32 startPos, sal_Int32 endPos )
{
    SearchResult aRet;
    aRet.subRegExpressions = 0;

    if( !xBreak.is() )
        return aRet;

    sal_Int32 nStt, nEnd;

    Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos,
            aSrchPara.Locale,
            WordType::ANYWORD_IGNOREWHITESPACES, true );

    do
    {
        if( aWBnd.endPos <= endPos )
            break;
        nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos;
        nEnd = std::min(aWBnd.endPos, startPos);

        if( nStt < nEnd &&
                pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit )
        {
            aRet.subRegExpressions = 1;
            aRet.startOffset = { nEnd };
            aRet.endOffset = { nStt };
            break;
        }
        if( !nStt )
            break;

        aWBnd = xBreak->previousWord( searchStr, nStt, aSrchPara.Locale,
                WordType::ANYWORD_IGNOREWHITESPACES);
    } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != searchStr.getLength() );
    return aRet;
}


namespace {
void setWildcardMatch( css::util::SearchResult& rRes, sal_Int32 nStartOffset, sal_Int32 nEndOffset )
{
    rRes.subRegExpressions = 1;
    rRes.startOffset = { nStartOffset };
    rRes.endOffset = { nEndOffset };
}
}

SearchResult TextSearch::WildcardSrchFrwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
{
    SearchResult aRes;
    aRes.subRegExpressions = 0;     // no match
    sal_Int32 nStartOffset = nStartPos;
    sal_Int32 nEndOffset = nEndPos;

    const sal_Int32 nStringLen = searchStr.getLength();

    // Forward nStartPos inclusive, nEndPos exclusive, but allow for empty
    // string match with [0,0).
    if (nStartPos < 0 || nEndPos > nStringLen || nEndPos < nStartPos ||
            (nStartPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
        return aRes;

    const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
    const sal_Int32 nPatternLen = rPattern.getLength();

    // Handle special cases empty pattern and/or string outside of the loop to
    // not add performance penalties there and simplify.
    if (nStartPos == nEndPos)
    {
        sal_Int32 i = 0;
        while (i < nPatternLen && rPattern[i] == '*')
            ++i;
        if (i == nPatternLen)
            setWildcardMatch( aRes, nStartOffset, nEndOffset);
        return aRes;
    }

    // Empty pattern does not match any non-empty string.
    if (!nPatternLen)
        return aRes;

    bool bRewind = false;
    sal_uInt32 cPattern = 0;
    sal_Int32 nPattern = 0;
    sal_Int32 nAfterFakePattern = nPattern;
    if (mbWildcardAllowSubstring)
    {
        // Fake a leading '*' wildcard.
        cPattern = '*';
        bRewind = true;
        // Assume a non-'*' pattern character follows. If it is a '*' instead
        // that will be handled in the loop by setting nPat.
        sal_uInt32 cu = rPattern.iterateCodePoints( &nAfterFakePattern);
        if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern < nPatternLen)
            rPattern.iterateCodePoints( &nAfterFakePattern);
    }

    sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
    sal_uInt32 cPatternAfterAsterisk = 0;
    bool bEscaped = false, bEscapedAfterAsterisk = false;

    // The loop code tries to avoid costly calls to iterateCodePoints() when
    // possible.

    do
    {
        if (bRewind)
        {
            // Reuse cPattern after '*', nPattern was correspondingly
            // incremented to point behind cPattern.
            bRewind = false;
        }
        else if (nPattern < nPatternLen)
        {
            // nPattern will be incremented by iterateCodePoints().
            cPattern = rPattern.iterateCodePoints( &nPattern);
            if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
            {
                bEscaped = true;
                cPattern = rPattern.iterateCodePoints( &nPattern);
            }
        }
        else
        {
            // A trailing '*' is handled below.
            if (mbWildcardAllowSubstring)
            {
                // If the pattern is consumed and substring match allowed we're good.
                setWildcardMatch( aRes, nStartOffset, nString);
                return aRes;
            }
            else if (nString < nEndPos && nLastAsterisk >= 0)
            {
                // If substring match is not allowed try a greedy '*' match.
                nPattern = nLastAsterisk;
                continue;   // do
            }
            else
                return aRes;
        }

        if (cPattern == '*' && !bEscaped)
        {
            // '*' is one code unit, so not using iterateCodePoints() is ok.
            while (nPattern < nPatternLen && rPattern[nPattern] == '*')
                ++nPattern;

            if (nPattern >= nPatternLen)
            {
                // Last pattern is '*', remaining string matches.
                setWildcardMatch( aRes, nStartOffset, nEndOffset);
                return aRes;
            }

            nLastAsterisk = nPattern;   // Remember last encountered '*'.

            // cPattern will be the next non-'*' character, nPattern
            // incremented.
            cPattern = rPattern.iterateCodePoints( &nPattern);
            if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen)
            {
                bEscaped = true;
                cPattern = rPattern.iterateCodePoints( &nPattern);
            }

            cPatternAfterAsterisk = cPattern;
            bEscapedAfterAsterisk = bEscaped;
            nPat = nPattern;    // Remember position of pattern behind '*', already incremented.
            nStr = nString;     // Remember the current string to be matched.
        }

        if (nString >= nEndPos)
            // Whatever follows in pattern, string will not match.
            return aRes;

        // nString will be incremented by iterateCodePoints().
        sal_uInt32 cString = searchStr.iterateCodePoints( &nString);

        if ((cPattern != '?' || bEscaped) && cPattern != cString)
        {
            if (nPat == -1)
                // Non-match already without any '*' pattern.
                return aRes;

            bRewind = true;
            nPattern = nPat;                    // Rewind pattern to character behind '*', already incremented.
            cPattern = cPatternAfterAsterisk;
            bEscaped = bEscapedAfterAsterisk;
            searchStr.iterateCodePoints( &nStr);
            nString = nStr;                     // Restore incremented remembered string position.
            if (nPat == nAfterFakePattern)
            {
                // Next start offset will be the next character.
                nStartOffset = nString;
            }
        }
        else
        {
            // An unescaped '?' pattern matched any character, or characters
            // matched. Reset only escaped state.
            bEscaped = false;
        }
    }
    while (nString < nEndPos);

    if (bRewind)
        return aRes;

    // Eat trailing '*' pattern that matches anything, including nothing.
    // '*' is one code unit, so not using iterateCodePoints() is ok.
    while (nPattern < nPatternLen && rPattern[nPattern] == '*')
        ++nPattern;

    if (nPattern == nPatternLen)
        setWildcardMatch( aRes, nStartOffset, nEndOffset);
    return aRes;
}

SearchResult TextSearch::WildcardSrchBkwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos )
{
    SearchResult aRes;
    aRes.subRegExpressions = 0;     // no match

    sal_Int32 nStartOffset = nStartPos;
    sal_Int32 nEndOffset = nEndPos;

    const sal_Int32 nStringLen = searchStr.getLength();

    // Backward nStartPos exclusive, nEndPos inclusive, but allow for empty
    // string match with (0,0].
    if (nStartPos > nStringLen || nEndPos < 0 || nStartPos < nEndPos ||
            (nEndPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos)))
        return aRes;

    const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2);
    sal_Int32 nPatternLen = rPattern.getLength();

    // Handle special cases empty pattern and/or string outside of the loop to
    // not add performance penalties there and simplify.
    if (nStartPos == nEndPos)
    {
        sal_Int32 i = 0;
        while (i < nPatternLen && rPattern[i] == '*')
            ++i;
        if (i == nPatternLen)
            setWildcardMatch( aRes, nStartOffset, nEndOffset);
        return aRes;
    }

    // Empty pattern does not match any non-empty string.
    if (!nPatternLen)
        return aRes;

    // Reverse escaped patterns to ease the handling of escapes, keeping escape
    // and following character as one sequence in backward direction.
    if ((bUsePrimarySrchStr && maWildcardReversePattern.isEmpty()) ||
            (!bUsePrimarySrchStr && maWildcardReversePattern2.isEmpty()))
    {
        OUStringBuffer aPatternBuf( rPattern);
        sal_Int32 nIndex = 0;
        while (nIndex < nPatternLen)
        {
            const sal_Int32 nOld = nIndex;
            const sal_uInt32 cu = rPattern.iterateCodePoints( &nIndex);
            if (cu == mcWildcardEscapeChar)
            {
                if (nIndex < nPatternLen)
                {
                    if (nIndex - nOld == 1)
                    {
                        // Simply move code units, we already memorized the one
                        // in 'cu'.
                        const sal_Int32 nOld2 = nIndex;
                        rPattern.iterateCodePoints( &nIndex);
                        for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
                            aPatternBuf[nOld+i] = rPattern[nOld2+i];
                        aPatternBuf[nIndex-1] = static_cast<sal_Unicode>(cu);
                    }
                    else
                    {
                        // Copy the escape character code units first in the
                        // unlikely case that it would not be of BMP.
                        assert(nIndex - nOld == 2);  // it's UTF-16, so...
                        sal_Unicode buf[2];
                        buf[0] = rPattern[nOld];
                        buf[1] = rPattern[nOld+1];
                        const sal_Int32 nOld2 = nIndex;
                        rPattern.iterateCodePoints( &nIndex);
                        for (sal_Int32 i=0; i < nIndex - nOld2; ++i)
                            aPatternBuf[nOld+i] = rPattern[nOld2+i];
                        aPatternBuf[nIndex-2] = buf[0];
                        aPatternBuf[nIndex-1] = buf[1];
                    }
                }
                else
                {
                    // Trailing escape would become leading escape, do what?
                    // Eliminate.
                    aPatternBuf.remove( nOld, nIndex - nOld);
                }
            }
        }
        if (bUsePrimarySrchStr)
            maWildcardReversePattern = aPatternBuf.makeStringAndClear();
        else
            maWildcardReversePattern2 = aPatternBuf.makeStringAndClear();
    }
    const OUString& rReversePattern = (bUsePrimarySrchStr ? maWildcardReversePattern : maWildcardReversePattern2);
    nPatternLen = rReversePattern.getLength();

    bool bRewind = false;
    sal_uInt32 cPattern = 0;
    sal_Int32 nPattern = nPatternLen;
    sal_Int32 nAfterFakePattern = nPattern;
    if (mbWildcardAllowSubstring)
    {
        // Fake a trailing '*' wildcard.
        cPattern = '*';
        bRewind = true;
        // Assume a non-'*' pattern character follows. If it is a '*' instead
        // that will be handled in the loop by setting nPat.
        sal_uInt32 cu = rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
        if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern > 0)
            rReversePattern.iterateCodePoints( &nAfterFakePattern, -1);
    }

    sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1;
    sal_uInt32 cPatternAfterAsterisk = 0;
    bool bEscaped = false, bEscapedAfterAsterisk = false;

    // The loop code tries to avoid costly calls to iterateCodePoints() when
    // possible.

    do
    {
        if (bRewind)
        {
            // Reuse cPattern after '*', nPattern was correspondingly
            // decremented to point before cPattern.
            bRewind = false;
        }
        else if (nPattern > 0)
        {
            // nPattern will be decremented by iterateCodePoints().
            cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
            if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
            {
                bEscaped = true;
                cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
            }
        }
        else
        {
            // A trailing '*' is handled below.
            if (mbWildcardAllowSubstring)
            {
                // If the pattern is consumed and substring match allowed we're good.
                setWildcardMatch( aRes, nStartOffset, nString);
                return aRes;
            }
            else if (nString > nEndPos && nLastAsterisk >= 0)
            {
                // If substring match is not allowed try a greedy '*' match.
                nPattern = nLastAsterisk;
                continue;   // do
            }
            else
                return aRes;
        }

        if (cPattern == '*' && !bEscaped)
        {
            // '*' is one code unit, so not using iterateCodePoints() is ok.
            while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
                --nPattern;

            if (nPattern <= 0)
            {
                // First pattern is '*', remaining string matches.
                setWildcardMatch( aRes, nStartOffset, nEndOffset);
                return aRes;
            }

            nLastAsterisk = nPattern;   // Remember last encountered '*'.

            // cPattern will be the previous non-'*' character, nPattern
            // decremented.
            cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
            if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0)
            {
                bEscaped = true;
                cPattern = rReversePattern.iterateCodePoints( &nPattern, -1);
            }

            cPatternAfterAsterisk = cPattern;
            bEscapedAfterAsterisk = bEscaped;
            nPat = nPattern;    // Remember position of pattern before '*', already decremented.
            nStr = nString;     // Remember the current string to be matched.
        }

        if (nString <= nEndPos)
            // Whatever leads in pattern, string will not match.
            return aRes;

        // nString will be decremented by iterateCodePoints().
        sal_uInt32 cString = searchStr.iterateCodePoints( &nString, -1);

        if ((cPattern != '?' || bEscaped) && cPattern != cString)
        {
            if (nPat == -1)
                // Non-match already without any '*' pattern.
                return aRes;

            bRewind = true;
            nPattern = nPat;                    // Rewind pattern to character before '*', already decremented.
            cPattern = cPatternAfterAsterisk;
            bEscaped = bEscapedAfterAsterisk;
            searchStr.iterateCodePoints( &nStr, -1);
            nString = nStr;                     // Restore decremented remembered string position.
            if (nPat == nAfterFakePattern)
            {
                // Next start offset will be this character (exclusive).
                nStartOffset = nString;
            }
        }
        else
        {
            // An unescaped '?' pattern matched any character, or characters
            // matched. Reset only escaped state.
            bEscaped = false;
        }
    }
    while (nString > nEndPos);

    if (bRewind)
        return aRes;

    // Eat leading '*' pattern that matches anything, including nothing.
    // '*' is one code unit, so not using iterateCodePoints() is ok.
    while (nPattern > 0 && rReversePattern[nPattern-1] == '*')
        --nPattern;

    if (nPattern == 0)
        setWildcardMatch( aRes, nStartOffset, nEndOffset);
    return aRes;
}


OUString SAL_CALL
TextSearch::getImplementationName()
{
    return u"com.sun.star.util.TextSearch_i18n"_ustr;
}

sal_Bool SAL_CALL TextSearch::supportsService(const OUString& rServiceName)
{
    return cppu::supportsService(this, rServiceName);
}

Sequence< OUString > SAL_CALL
TextSearch::getSupportedServiceNames()
{
    return { u"com.sun.star.util.TextSearch"_ustr, u"com.sun.star.util.TextSearch2"_ustr };
}

extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
i18npool_TextSearch_get_implementation(
    css::uno::XComponentContext* context , css::uno::Sequence<css::uno::Any> const&)
{
    return cppu::acquire(new TextSearch(context));
}

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Coverage Report

Created: 2026-02-14 09:37