/src/libreoffice/i18npool/source/search/textsearch.cxx
Line | Count | Source |
1 | | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
2 | | /* |
3 | | * This file is part of the LibreOffice project. |
4 | | * |
5 | | * This Source Code Form is subject to the terms of the Mozilla Public |
6 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
7 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8 | | * |
9 | | * This file incorporates work covered by the following license notice: |
10 | | * |
11 | | * Licensed to the Apache Software Foundation (ASF) under one or more |
12 | | * contributor license agreements. See the NOTICE file distributed |
13 | | * with this work for additional information regarding copyright |
14 | | * ownership. The ASF licenses this file to you under the Apache |
15 | | * License, Version 2.0 (the "License"); you may not use this file |
16 | | * except in compliance with the License. You may obtain a copy of |
17 | | * the License at http://www.apache.org/licenses/LICENSE-2.0 . |
18 | | */ |
19 | | |
20 | | #include "textsearch.hxx" |
21 | | #include "levdis.hxx" |
22 | | #include <com/sun/star/i18n/BreakIterator.hpp> |
23 | | #include <com/sun/star/util/SearchAlgorithms2.hpp> |
24 | | #include <com/sun/star/util/SearchFlags.hpp> |
25 | | #include <com/sun/star/i18n/WordType.hpp> |
26 | | #include <com/sun/star/i18n/ScriptType.hpp> |
27 | | #include <com/sun/star/i18n/CharacterIteratorMode.hpp> |
28 | | #include <com/sun/star/i18n/CharacterClassification.hpp> |
29 | | #include <com/sun/star/i18n/KCharacterType.hpp> |
30 | | #include <com/sun/star/i18n/Transliteration.hpp> |
31 | | #include <cppuhelper/supportsservice.hxx> |
32 | | #include <cppuhelper/weak.hxx> |
33 | | #include <i18nutil/transliteration.hxx> |
34 | | #include <rtl/ustrbuf.hxx> |
35 | | #include <sal/log.hxx> |
36 | | |
37 | | #include <unicode/regex.h> |
38 | | |
39 | | using namespace ::com::sun::star::util; |
40 | | using namespace ::com::sun::star::uno; |
41 | | using namespace ::com::sun::star::lang; |
42 | | using namespace ::com::sun::star::i18n; |
43 | | using namespace ::com::sun::star; |
44 | | |
45 | | const TransliterationFlags COMPLEX_TRANS_MASK = |
46 | | TransliterationFlags::ignoreBaFa_ja_JP | |
47 | | TransliterationFlags::ignoreIterationMark_ja_JP | |
48 | | TransliterationFlags::ignoreTiJi_ja_JP | |
49 | | TransliterationFlags::ignoreHyuByu_ja_JP | |
50 | | TransliterationFlags::ignoreSeZe_ja_JP | |
51 | | TransliterationFlags::ignoreIandEfollowedByYa_ja_JP | |
52 | | TransliterationFlags::ignoreKiKuFollowedBySa_ja_JP | |
53 | | TransliterationFlags::ignoreProlongedSoundMark_ja_JP; |
54 | | |
55 | | namespace |
56 | | { |
57 | | TransliterationFlags maskComplexTrans( TransliterationFlags n ) |
58 | 0 | { |
59 | | // IGNORE_KANA and FULLWIDTH_HALFWIDTH are simple but need to take effect |
60 | | // in complex transliteration. |
61 | 0 | return |
62 | 0 | n & (COMPLEX_TRANS_MASK | // all set ignore bits |
63 | 0 | TransliterationFlags::IGNORE_KANA | // plus IGNORE_KANA bit |
64 | 0 | TransliterationFlags::FULLWIDTH_HALFWIDTH); // and the FULLWIDTH_HALFWIDTH value |
65 | 0 | } |
66 | | |
67 | | bool isComplexTrans( TransliterationFlags n ) |
68 | 6 | { |
69 | 6 | return bool(n & COMPLEX_TRANS_MASK); |
70 | 6 | } |
71 | | |
72 | | TransliterationFlags maskSimpleTrans( TransliterationFlags n ) |
73 | 12 | { |
74 | 12 | return n & ~COMPLEX_TRANS_MASK; |
75 | 12 | } |
76 | | |
77 | | bool isSimpleTrans( TransliterationFlags n ) |
78 | 9 | { |
79 | 9 | return bool(maskSimpleTrans(n)); |
80 | 9 | } |
81 | | |
82 | | // Regex patterns are case sensitive. |
83 | | TransliterationFlags maskSimpleRegexTrans( TransliterationFlags n ) |
84 | 0 | { |
85 | 0 | TransliterationFlags m = (n & TransliterationFlags::IGNORE_MASK) & ~TransliterationFlags::IGNORE_CASE; |
86 | 0 | TransliterationFlags v = n & TransliterationFlags::NON_IGNORE_MASK; |
87 | 0 | if (v == TransliterationFlags::UPPERCASE_LOWERCASE || v == TransliterationFlags::LOWERCASE_UPPERCASE) |
88 | 0 | v = TransliterationFlags::NONE; |
89 | 0 | return (m | v) & ~COMPLEX_TRANS_MASK; |
90 | 0 | } |
91 | | |
92 | | bool isSimpleRegexTrans( TransliterationFlags n ) |
93 | 0 | { |
94 | 0 | return bool(maskSimpleRegexTrans(n)); |
95 | 0 | } |
96 | | |
97 | | bool isReplacePunctuation( std::u16string_view rStr ) |
98 | 0 | { |
99 | 0 | return rStr.find(u'\u2018') != std::u16string_view::npos || |
100 | 0 | rStr.find(u'\u2019') != std::u16string_view::npos || |
101 | 0 | rStr.find(u'\u201A') != std::u16string_view::npos || |
102 | 0 | rStr.find(u'\u201B') != std::u16string_view::npos || |
103 | 0 | rStr.find(u'\u201C') != std::u16string_view::npos || |
104 | 0 | rStr.find(u'\u201D') != std::u16string_view::npos || |
105 | 0 | rStr.find(u'\u201E') != std::u16string_view::npos || |
106 | 0 | rStr.find(u'\u201F') != std::u16string_view::npos; |
107 | 0 | } |
108 | | |
109 | | OUString replacePunctuation( const OUString &rStr ) |
110 | 0 | { |
111 | 0 | return rStr.replace(u'\u2018', '\'') |
112 | 0 | .replace(u'\u2019', '\'') |
113 | 0 | .replace(u'\u201A', '\'') |
114 | 0 | .replace(u'\u201B', '\'') |
115 | 0 | .replace(u'\u201C', '"') |
116 | 0 | .replace(u'\u201D', '"') |
117 | 0 | .replace(u'\u201E', '"') |
118 | 0 | .replace(u'\u201F', '"'); |
119 | 0 | } |
120 | | }; |
121 | | |
122 | | TextSearch::TextSearch(const Reference < XComponentContext > & rxContext) |
123 | 3 | : m_xContext( rxContext ) |
124 | 3 | { |
125 | 3 | SearchOptions2 aOpt; |
126 | 3 | aOpt.AlgorithmType2 = SearchAlgorithms2::ABSOLUTE; |
127 | 3 | aOpt.algorithmType = SearchAlgorithms_ABSOLUTE; |
128 | 3 | aOpt.searchFlag = SearchFlags::ALL_IGNORE_CASE; |
129 | | //aOpt.Locale = ???; |
130 | 3 | setOptions( aOpt ); |
131 | 3 | } |
132 | | |
133 | | TextSearch::~TextSearch() |
134 | 3 | { |
135 | 3 | pRegexMatcher.reset(); |
136 | 3 | pWLD.reset(); |
137 | 3 | pJumpTable.reset(); |
138 | 3 | pJumpTable2.reset(); |
139 | 3 | } |
140 | | |
141 | | void TextSearch::setOptions2( const SearchOptions2& rOptions ) |
142 | 6 | { |
143 | 6 | std::unique_lock g(m_aMutex); |
144 | | |
145 | 6 | aSrchPara = rOptions; |
146 | | |
147 | 6 | pRegexMatcher.reset(); |
148 | 6 | pWLD.reset(); |
149 | 6 | pJumpTable.reset(); |
150 | 6 | pJumpTable2.reset(); |
151 | 6 | maWildcardReversePattern.clear(); |
152 | 6 | maWildcardReversePattern2.clear(); |
153 | 6 | TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(aSrchPara.transliterateFlags); |
154 | 6 | bSearchApostrophe = false; |
155 | 6 | bool bReplaceApostrophe = false; |
156 | 6 | if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP) |
157 | 0 | { |
158 | | // RESrchPrepare will consider aSrchPara.transliterateFlags when |
159 | | // picking the actual regex pattern |
160 | | // (sSrchStr|sSrchStr2|SearchOptions2::searchString) and setting |
161 | | // case-insensitivity. Create transliteration instance, if any, without |
162 | | // ignore-case so later in TextSearch::searchForward() the string to |
163 | | // match is not case-altered, leave case-(in)sensitive to regex engine. |
164 | 0 | transliterateFlags &= ~TransliterationFlags::IGNORE_CASE; |
165 | 0 | } |
166 | 6 | else if ( aSrchPara.searchString.indexOf('\'') > - 1 || aSrchPara.searchString.indexOf('"') > - 1 ) |
167 | 0 | { |
168 | 0 | bSearchApostrophe = true; |
169 | 0 | bReplaceApostrophe = isReplacePunctuation(aSrchPara.searchString); |
170 | 0 | } |
171 | | |
172 | | // Create Transliteration class |
173 | 6 | if( isSimpleTrans( transliterateFlags) ) |
174 | 3 | { |
175 | 3 | if( !xTranslit.is() ) |
176 | 3 | xTranslit.set( Transliteration::create( m_xContext ) ); |
177 | 3 | xTranslit->loadModule( |
178 | 3 | static_cast<TransliterationModules>(maskSimpleTrans(transliterateFlags)), |
179 | 3 | aSrchPara.Locale); |
180 | 3 | } |
181 | 3 | else if( xTranslit.is() ) |
182 | 0 | xTranslit = nullptr; |
183 | | |
184 | | // Create Transliteration for 2<->1, 2<->2 transliteration |
185 | 6 | if ( isComplexTrans( transliterateFlags) ) |
186 | 0 | { |
187 | 0 | if( !xTranslit2.is() ) |
188 | 0 | xTranslit2.set( Transliteration::create( m_xContext ) ); |
189 | | // Load transliteration module |
190 | 0 | xTranslit2->loadModule( |
191 | 0 | static_cast<TransliterationModules>(maskComplexTrans(transliterateFlags)), |
192 | 0 | aSrchPara.Locale); |
193 | 0 | } |
194 | | |
195 | 6 | if ( !xBreak.is() ) |
196 | 3 | xBreak = css::i18n::BreakIterator::create( m_xContext ); |
197 | | |
198 | 6 | sSrchStr = aSrchPara.searchString; |
199 | | |
200 | | // Transliterate search string. |
201 | 6 | if (aSrchPara.AlgorithmType2 == SearchAlgorithms2::REGEXP) |
202 | 0 | { |
203 | 0 | if (isSimpleRegexTrans(transliterateFlags)) |
204 | 0 | { |
205 | 0 | if (maskSimpleRegexTrans(transliterateFlags) != |
206 | 0 | maskSimpleTrans(transliterateFlags)) |
207 | 0 | { |
208 | 0 | css::uno::Reference< XExtendedTransliteration > xTranslitPattern( |
209 | 0 | Transliteration::create( m_xContext )); |
210 | 0 | if (xTranslitPattern.is()) |
211 | 0 | { |
212 | 0 | xTranslitPattern->loadModule( |
213 | 0 | static_cast<TransliterationModules>(maskSimpleRegexTrans(transliterateFlags)), |
214 | 0 | aSrchPara.Locale); |
215 | 0 | sSrchStr = xTranslitPattern->transliterateString2String( |
216 | 0 | aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); |
217 | 0 | } |
218 | 0 | } |
219 | 0 | else |
220 | 0 | { |
221 | 0 | if (xTranslit.is()) |
222 | 0 | sSrchStr = xTranslit->transliterateString2String( |
223 | 0 | aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); |
224 | 0 | } |
225 | | // xTranslit2 complex transliterated sSrchStr2 is not used in |
226 | | // regex, see TextSearch::searchForward() and |
227 | | // TextSearch::searchBackward() |
228 | 0 | } |
229 | 0 | } |
230 | 6 | else |
231 | 6 | { |
232 | 6 | if ( xTranslit.is() && isSimpleTrans(transliterateFlags) ) |
233 | 3 | sSrchStr = xTranslit->transliterateString2String( |
234 | 3 | aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); |
235 | | |
236 | 6 | if ( xTranslit2.is() && isComplexTrans(transliterateFlags) ) |
237 | 0 | sSrchStr2 = xTranslit2->transliterateString2String( |
238 | 0 | aSrchPara.searchString, 0, aSrchPara.searchString.getLength()); |
239 | 6 | } |
240 | | |
241 | 6 | if ( bReplaceApostrophe ) |
242 | 0 | sSrchStr = replacePunctuation(sSrchStr); |
243 | | |
244 | | // Take the new SearchOptions2::AlgorithmType2 field and ignore |
245 | | // SearchOptions::algorithmType |
246 | 6 | switch( aSrchPara.AlgorithmType2) |
247 | 6 | { |
248 | 0 | case SearchAlgorithms2::REGEXP: |
249 | 0 | fnForward = &TextSearch::RESrchFrwrd; |
250 | 0 | fnBackward = &TextSearch::RESrchBkwrd; |
251 | 0 | RESrchPrepare( aSrchPara); |
252 | 0 | break; |
253 | | |
254 | 0 | case SearchAlgorithms2::APPROXIMATE: |
255 | 0 | fnForward = &TextSearch::ApproxSrchFrwrd; |
256 | 0 | fnBackward = &TextSearch::ApproxSrchBkwrd; |
257 | |
|
258 | 0 | pWLD.reset( new WLevDistance( sSrchStr.getStr(), aSrchPara.changedChars, |
259 | 0 | aSrchPara.insertedChars, aSrchPara.deletedChars, |
260 | 0 | 0 != (SearchFlags::LEV_RELAXED & aSrchPara.searchFlag ) ) ); |
261 | |
|
262 | 0 | nLimit = pWLD->GetLimit(); |
263 | 0 | break; |
264 | | |
265 | 2 | case SearchAlgorithms2::WILDCARD: |
266 | 2 | mcWildcardEscapeChar = static_cast<sal_uInt32>(aSrchPara.WildcardEscapeCharacter); |
267 | 2 | mbWildcardAllowSubstring = ((aSrchPara.searchFlag & SearchFlags::WILD_MATCH_SELECTION) == 0); |
268 | 2 | fnForward = &TextSearch::WildcardSrchFrwrd; |
269 | 2 | fnBackward = &TextSearch::WildcardSrchBkwrd; |
270 | 2 | break; |
271 | | |
272 | 0 | default: |
273 | 0 | SAL_WARN("i18npool","TextSearch::setOptions2 - default what?"); |
274 | 0 | [[fallthrough]]; |
275 | 4 | case SearchAlgorithms2::ABSOLUTE: |
276 | 4 | fnForward = &TextSearch::NSrchFrwrd; |
277 | 4 | fnBackward = &TextSearch::NSrchBkwrd; |
278 | 4 | break; |
279 | 6 | } |
280 | 6 | } |
281 | | |
282 | | void TextSearch::setOptions( const SearchOptions& rOptions ) |
283 | 3 | { |
284 | 3 | sal_Int16 nAlgorithmType2; |
285 | 3 | switch (rOptions.algorithmType) |
286 | 3 | { |
287 | 0 | case SearchAlgorithms_REGEXP: |
288 | 0 | nAlgorithmType2 = SearchAlgorithms2::REGEXP; |
289 | 0 | break; |
290 | 0 | case SearchAlgorithms_APPROXIMATE: |
291 | 0 | nAlgorithmType2 = SearchAlgorithms2::APPROXIMATE; |
292 | 0 | break; |
293 | 0 | default: |
294 | 0 | SAL_WARN("i18npool","TextSearch::setOptions - default what?"); |
295 | 0 | [[fallthrough]]; |
296 | 3 | case SearchAlgorithms_ABSOLUTE: |
297 | 3 | nAlgorithmType2 = SearchAlgorithms2::ABSOLUTE; |
298 | 3 | break; |
299 | 3 | } |
300 | | // It would be nice if an inherited struct had a ctor that takes an |
301 | | // instance of the object the struct derived from... |
302 | 3 | SearchOptions2 aOptions2( |
303 | 3 | rOptions.algorithmType, |
304 | 3 | rOptions.searchFlag, |
305 | 3 | rOptions.searchString, |
306 | 3 | rOptions.replaceString, |
307 | 3 | rOptions.Locale, |
308 | 3 | rOptions.changedChars, |
309 | 3 | rOptions.deletedChars, |
310 | 3 | rOptions.insertedChars, |
311 | 3 | rOptions.transliterateFlags, |
312 | 3 | nAlgorithmType2, |
313 | 3 | 0 // no wildcard search, no escape character... |
314 | 3 | ); |
315 | 3 | setOptions2( aOptions2); |
316 | 3 | } |
317 | | |
318 | | static sal_Int32 FindPosInSeq_Impl( const Sequence <sal_Int32>& rOff, sal_Int32 nPos ) |
319 | 0 | { |
320 | 0 | auto pOff = std::find_if(rOff.begin(), rOff.end(), |
321 | 0 | [nPos](const sal_Int32 nOff) { return nOff >= nPos; }); |
322 | 0 | return static_cast<sal_Int32>(std::distance(rOff.begin(), pOff)); |
323 | 0 | } |
324 | | |
325 | | SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) |
326 | 1.31k | { |
327 | 1.31k | std::unique_lock g(m_aMutex); |
328 | | |
329 | 1.31k | SearchResult sres; |
330 | | |
331 | 1.31k | OUString in_str(searchStr); |
332 | | |
333 | | // in non-regex mode, allow searching typographical apostrophe with the ASCII one |
334 | | // to avoid regression after using automatic conversion to U+2019 during typing in Writer |
335 | 1.31k | bool bReplaceApostrophe = bSearchApostrophe && isReplacePunctuation(in_str); |
336 | | |
337 | 1.31k | bUsePrimarySrchStr = true; |
338 | | |
339 | 1.31k | if ( xTranslit.is() ) |
340 | 1.31k | { |
341 | | // apply normal transliteration (1<->1, 1<->0) |
342 | | |
343 | 1.31k | sal_Int32 nInStartPos = startPos; |
344 | 1.31k | if (pRegexMatcher && startPos > 0) |
345 | 0 | { |
346 | | // tdf#89665, tdf#75806: An optimization to avoid transliterating the whole string, yet |
347 | | // transliterate enough of the leading text to allow sensible look-behind assertions. |
348 | | // 100 is chosen arbitrarily in the hope that look-behind assertions would largely fit. |
349 | | // See http://userguide.icu-project.org/strings/regexp for look-behind assertion syntax. |
350 | | // When search regex doesn't start with an assertion, 3 is to allow startPos to be in |
351 | | // the middle of a surrogate pair, preceded by another surrogate pair. |
352 | 0 | const sal_Int32 nMaxLeadingLen = aSrchPara.searchString.startsWith("(?") ? 100 : 3; |
353 | 0 | nInStartPos -= std::min(nMaxLeadingLen, startPos); |
354 | 0 | } |
355 | 1.31k | sal_Int32 nInEndPos = endPos; |
356 | 1.31k | if (pRegexMatcher && endPos < searchStr.getLength()) |
357 | 0 | { |
358 | | // tdf#65038: ditto for look-ahead assertions |
359 | 0 | const sal_Int32 nMaxTrailingLen = aSrchPara.searchString.endsWith(")") ? 100 : 3; |
360 | 0 | nInEndPos += std::min(nMaxTrailingLen, searchStr.getLength() - endPos); |
361 | 0 | } |
362 | | |
363 | 1.31k | css::uno::Sequence<sal_Int32> offset(nInEndPos - nInStartPos); |
364 | 1.31k | in_str = xTranslit->transliterate(searchStr, nInStartPos, nInEndPos - nInStartPos, offset); |
365 | | |
366 | 1.31k | if ( bReplaceApostrophe ) |
367 | 0 | in_str = replacePunctuation(in_str); |
368 | | |
369 | | // JP 20.6.2001: also the start and end positions must be corrected! |
370 | 1.31k | sal_Int32 newStartPos = |
371 | 1.31k | (startPos == 0) ? 0 : FindPosInSeq_Impl( offset, startPos ); |
372 | | |
373 | 1.31k | sal_Int32 newEndPos = (endPos < searchStr.getLength()) |
374 | 1.31k | ? FindPosInSeq_Impl( offset, endPos ) |
375 | 1.31k | : in_str.getLength(); |
376 | | |
377 | 1.31k | sres = (this->*fnForward)( g, in_str, newStartPos, newEndPos ); |
378 | | |
379 | | // Map offsets back to untransliterated string. |
380 | 1.31k | const sal_Int32 nOffsets = offset.getLength(); |
381 | 1.31k | if (nOffsets) |
382 | 1.31k | { |
383 | 1.31k | auto sres_startOffsetRange = asNonConstRange(sres.startOffset); |
384 | 1.31k | auto sres_endOffsetRange = asNonConstRange(sres.endOffset); |
385 | | // For regex nGroups is the number of groups+1 with group 0 being |
386 | | // the entire match. |
387 | 1.31k | const sal_Int32 nGroups = sres.startOffset.getLength(); |
388 | 1.50k | for ( sal_Int32 k = 0; k < nGroups; k++ ) |
389 | 185 | { |
390 | 185 | const sal_Int32 nStart = sres.startOffset[k]; |
391 | | // Result offsets are negative (-1) if a group expression was |
392 | | // not matched. |
393 | 185 | if (nStart >= 0) |
394 | 185 | sres_startOffsetRange[k] = (nStart < nOffsets ? offset[nStart] : (offset[nOffsets - 1] + 1)); |
395 | | // JP 20.6.2001: end is ever exclusive and then don't return |
396 | | // the position of the next character - return the |
397 | | // next position behind the last found character! |
398 | | // "a b c" find "b" must return 2,3 and not 2,4!!! |
399 | 185 | const sal_Int32 nStop = sres.endOffset[k]; |
400 | 185 | if (nStop >= 0) |
401 | 185 | { |
402 | 185 | if (nStop > 0) |
403 | 185 | sres_endOffsetRange[k] = offset[(nStop <= nOffsets ? nStop : nOffsets) - 1] + 1; |
404 | 0 | else |
405 | 0 | sres_endOffsetRange[k] = offset[0]; |
406 | 185 | } |
407 | 185 | } |
408 | 1.31k | } |
409 | 1.31k | } |
410 | 0 | else |
411 | 0 | { |
412 | 0 | if ( bReplaceApostrophe ) |
413 | 0 | in_str = in_str.replace(u'\u2019', '\''); |
414 | |
|
415 | 0 | sres = (this->*fnForward)( g, in_str, startPos, endPos ); |
416 | 0 | } |
417 | | |
418 | 1.31k | if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP) |
419 | 0 | { |
420 | 0 | SearchResult sres2; |
421 | |
|
422 | 0 | in_str = searchStr; |
423 | 0 | css::uno::Sequence <sal_Int32> offset( in_str.getLength()); |
424 | |
|
425 | 0 | in_str = xTranslit2->transliterate( searchStr, 0, in_str.getLength(), offset ); |
426 | |
|
427 | 0 | if( startPos ) |
428 | 0 | startPos = FindPosInSeq_Impl( offset, startPos ); |
429 | |
|
430 | 0 | if( endPos < searchStr.getLength() ) |
431 | 0 | endPos = FindPosInSeq_Impl( offset, endPos ); |
432 | 0 | else |
433 | 0 | endPos = in_str.getLength(); |
434 | |
|
435 | 0 | bUsePrimarySrchStr = false; |
436 | 0 | sres2 = (this->*fnForward)( g, in_str, startPos, endPos ); |
437 | 0 | auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset); |
438 | 0 | auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset); |
439 | |
|
440 | 0 | for ( int k = 0; k < sres2.startOffset.getLength(); k++ ) |
441 | 0 | { |
442 | 0 | if (sres2.startOffset[k]) |
443 | 0 | sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1] + 1; |
444 | 0 | if (sres2.endOffset[k]) |
445 | 0 | sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1] + 1; |
446 | 0 | } |
447 | | |
448 | | // pick first and long one |
449 | 0 | if ( sres.subRegExpressions == 0) |
450 | 0 | return sres2; |
451 | 0 | if ( sres2.subRegExpressions == 1) |
452 | 0 | { |
453 | 0 | if ( sres.startOffset[0] > sres2.startOffset[0]) |
454 | 0 | return sres2; |
455 | 0 | else if ( sres.startOffset[0] == sres2.startOffset[0] && |
456 | 0 | sres.endOffset[0] < sres2.endOffset[0]) |
457 | 0 | return sres2; |
458 | 0 | } |
459 | 0 | } |
460 | | |
461 | 1.31k | return sres; |
462 | 1.31k | } |
463 | | |
464 | | SearchResult TextSearch::searchBackward( const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) |
465 | 0 | { |
466 | 0 | std::unique_lock g(m_aMutex); |
467 | |
|
468 | 0 | SearchResult sres; |
469 | |
|
470 | 0 | OUString in_str(searchStr); |
471 | | |
472 | | // in non-regex mode, allow searching typographical apostrophe with the ASCII one |
473 | | // to avoid regression after using automatic conversion to U+2019 during typing in Writer |
474 | 0 | bool bReplaceApostrophe = bSearchApostrophe && isReplacePunctuation(in_str); |
475 | |
|
476 | 0 | bUsePrimarySrchStr = true; |
477 | |
|
478 | 0 | if ( xTranslit.is() ) |
479 | 0 | { |
480 | | // apply only simple 1<->1 transliteration here |
481 | 0 | css::uno::Sequence<sal_Int32> offset(startPos - endPos); |
482 | 0 | in_str = xTranslit->transliterate( searchStr, endPos, startPos - endPos, offset ); |
483 | |
|
484 | 0 | if ( bReplaceApostrophe ) |
485 | 0 | in_str = replacePunctuation(in_str); |
486 | | |
487 | | // JP 20.6.2001: also the start and end positions must be corrected! |
488 | 0 | sal_Int32 const newStartPos = (startPos < searchStr.getLength()) |
489 | 0 | ? FindPosInSeq_Impl( offset, startPos ) |
490 | 0 | : in_str.getLength(); |
491 | |
|
492 | 0 | sal_Int32 const newEndPos = |
493 | 0 | (endPos == 0) ? 0 : FindPosInSeq_Impl( offset, endPos ); |
494 | | |
495 | | // TODO: this would need nExtraOffset handling to avoid $ matching |
496 | | // if (pRegexMatcher && startPos < searchStr.getLength()) |
497 | | // but that appears to be impossible with ICU regex |
498 | |
|
499 | 0 | sres = (this->*fnBackward)( g, in_str, newStartPos, newEndPos ); |
500 | | |
501 | | // Map offsets back to untransliterated string. |
502 | 0 | const sal_Int32 nOffsets = offset.getLength(); |
503 | 0 | if (nOffsets) |
504 | 0 | { |
505 | 0 | auto sres_startOffsetRange = asNonConstRange(sres.startOffset); |
506 | 0 | auto sres_endOffsetRange = asNonConstRange(sres.endOffset); |
507 | | // For regex nGroups is the number of groups+1 with group 0 being |
508 | | // the entire match. |
509 | 0 | const sal_Int32 nGroups = sres.startOffset.getLength(); |
510 | 0 | for ( sal_Int32 k = 0; k < nGroups; k++ ) |
511 | 0 | { |
512 | 0 | const sal_Int32 nStart = sres.startOffset[k]; |
513 | | // Result offsets are negative (-1) if a group expression was |
514 | | // not matched. |
515 | 0 | if (nStart >= 0) |
516 | 0 | { |
517 | 0 | if (nStart > 0) |
518 | 0 | sres_startOffsetRange[k] = offset[(nStart <= nOffsets ? nStart : nOffsets) - 1] + 1; |
519 | 0 | else |
520 | 0 | sres_startOffsetRange[k] = offset[0]; |
521 | 0 | } |
522 | | // JP 20.6.2001: end is ever exclusive and then don't return |
523 | | // the position of the next character - return the |
524 | | // next position behind the last found character! |
525 | | // "a b c" find "b" must return 2,3 and not 2,4!!! |
526 | 0 | const sal_Int32 nStop = sres.endOffset[k]; |
527 | 0 | if (nStop >= 0) |
528 | 0 | sres_endOffsetRange[k] = (nStop < nOffsets ? offset[nStop] : (offset[nOffsets - 1] + 1)); |
529 | 0 | } |
530 | 0 | } |
531 | 0 | } |
532 | 0 | else |
533 | 0 | { |
534 | 0 | if ( bReplaceApostrophe ) |
535 | 0 | in_str = replacePunctuation(in_str); |
536 | |
|
537 | 0 | sres = (this->*fnBackward)( g, in_str, startPos, endPos ); |
538 | 0 | } |
539 | |
|
540 | 0 | if ( xTranslit2.is() && aSrchPara.AlgorithmType2 != SearchAlgorithms2::REGEXP ) |
541 | 0 | { |
542 | 0 | SearchResult sres2; |
543 | |
|
544 | 0 | in_str = searchStr; |
545 | 0 | css::uno::Sequence <sal_Int32> offset( in_str.getLength()); |
546 | |
|
547 | 0 | in_str = xTranslit2->transliterate(searchStr, 0, in_str.getLength(), offset); |
548 | |
|
549 | 0 | if( startPos < searchStr.getLength() ) |
550 | 0 | startPos = FindPosInSeq_Impl( offset, startPos ); |
551 | 0 | else |
552 | 0 | startPos = in_str.getLength(); |
553 | |
|
554 | 0 | if( endPos ) |
555 | 0 | endPos = FindPosInSeq_Impl( offset, endPos ); |
556 | |
|
557 | 0 | bUsePrimarySrchStr = false; |
558 | 0 | sres2 = (this->*fnBackward)( g, in_str, startPos, endPos ); |
559 | 0 | auto sres2_startOffsetRange = asNonConstRange(sres2.startOffset); |
560 | 0 | auto sres2_endOffsetRange = asNonConstRange(sres2.endOffset); |
561 | |
|
562 | 0 | for( int k = 0; k < sres2.startOffset.getLength(); k++ ) |
563 | 0 | { |
564 | 0 | if (sres2.startOffset[k]) |
565 | 0 | sres2_startOffsetRange[k] = offset[sres2.startOffset[k]-1]+1; |
566 | 0 | if (sres2.endOffset[k]) |
567 | 0 | sres2_endOffsetRange[k] = offset[sres2.endOffset[k]-1]+1; |
568 | 0 | } |
569 | | |
570 | | // pick last and long one |
571 | 0 | if ( sres.subRegExpressions == 0 ) |
572 | 0 | return sres2; |
573 | 0 | if ( sres2.subRegExpressions == 1 ) |
574 | 0 | { |
575 | 0 | if ( sres.startOffset[0] < sres2.startOffset[0] ) |
576 | 0 | return sres2; |
577 | 0 | if ( sres.startOffset[0] == sres2.startOffset[0] && |
578 | 0 | sres.endOffset[0] > sres2.endOffset[0] ) |
579 | 0 | return sres2; |
580 | 0 | } |
581 | 0 | } |
582 | | |
583 | 0 | return sres; |
584 | 0 | } |
585 | | |
586 | | |
587 | | bool TextSearch::IsDelimiter( const OUString& rStr, sal_Int32 nPos ) const |
588 | 0 | { |
589 | 0 | bool bRet = true; |
590 | 0 | if( '\x7f' != rStr[nPos]) |
591 | 0 | { |
592 | 0 | if ( !xCharClass.is() ) |
593 | 0 | xCharClass = CharacterClassification::create( m_xContext ); |
594 | 0 | sal_Int32 nCType = xCharClass->getCharacterType( rStr, nPos, |
595 | 0 | aSrchPara.Locale ); |
596 | 0 | if( 0 != (( KCharacterType::DIGIT | KCharacterType::ALPHA | |
597 | 0 | KCharacterType::LETTER ) & nCType ) ) |
598 | 0 | bRet = false; |
599 | 0 | } |
600 | 0 | return bRet; |
601 | 0 | } |
602 | | |
603 | | // --------- helper methods for Boyer-Moore like text searching ---------- |
604 | | // TODO: use ICU's regex UREGEX_LITERAL mode instead when it becomes available |
605 | | |
606 | | void TextSearch::MakeForwardTab() |
607 | 24 | { |
608 | | // create the jumptable for the search text |
609 | | |
610 | 24 | if( pJumpTable && bIsForwardTab ) |
611 | 23 | { |
612 | 23 | return; // the jumpTable is ok |
613 | 23 | } |
614 | 1 | bIsForwardTab = true; |
615 | | |
616 | 1 | sal_Int32 n, nLen = sSrchStr.getLength(); |
617 | 1 | pJumpTable.reset( new TextSearchJumpTable ); |
618 | | |
619 | 9 | for( n = 0; n < nLen - 1; ++n ) |
620 | 8 | { |
621 | 8 | sal_Unicode cCh = sSrchStr[n]; |
622 | 8 | sal_Int32 nDiff = nLen - n - 1; |
623 | 8 | TextSearchJumpTable::value_type aEntry( cCh, nDiff ); |
624 | | |
625 | 8 | ::std::pair< TextSearchJumpTable::iterator, bool > aPair = |
626 | 8 | pJumpTable->insert( aEntry ); |
627 | 8 | if ( !aPair.second ) |
628 | 1 | (*(aPair.first)).second = nDiff; |
629 | 8 | } |
630 | 1 | } |
631 | | |
632 | | void TextSearch::MakeForwardTab2() |
633 | 0 | { |
634 | | // create the jumptable for the search text |
635 | 0 | if( pJumpTable2 && bIsForwardTab ) |
636 | 0 | { |
637 | 0 | return; // the jumpTable is ok |
638 | 0 | } |
639 | 0 | bIsForwardTab = true; |
640 | |
|
641 | 0 | sal_Int32 n, nLen = sSrchStr2.getLength(); |
642 | 0 | pJumpTable2.reset( new TextSearchJumpTable ); |
643 | |
|
644 | 0 | for( n = 0; n < nLen - 1; ++n ) |
645 | 0 | { |
646 | 0 | sal_Unicode cCh = sSrchStr2[n]; |
647 | 0 | sal_Int32 nDiff = nLen - n - 1; |
648 | |
|
649 | 0 | TextSearchJumpTable::value_type aEntry( cCh, nDiff ); |
650 | 0 | ::std::pair< TextSearchJumpTable::iterator, bool > aPair = |
651 | 0 | pJumpTable2->insert( aEntry ); |
652 | 0 | if ( !aPair.second ) |
653 | 0 | (*(aPair.first)).second = nDiff; |
654 | 0 | } |
655 | 0 | } |
656 | | |
657 | | void TextSearch::MakeBackwardTab() |
658 | 0 | { |
659 | | // create the jumptable for the search text |
660 | 0 | if( pJumpTable && !bIsForwardTab) |
661 | 0 | { |
662 | 0 | return; // the jumpTable is ok |
663 | 0 | } |
664 | 0 | bIsForwardTab = false; |
665 | |
|
666 | 0 | sal_Int32 n, nLen = sSrchStr.getLength(); |
667 | 0 | pJumpTable.reset( new TextSearchJumpTable ); |
668 | |
|
669 | 0 | for( n = nLen-1; n > 0; --n ) |
670 | 0 | { |
671 | 0 | sal_Unicode cCh = sSrchStr[n]; |
672 | 0 | TextSearchJumpTable::value_type aEntry( cCh, n ); |
673 | 0 | ::std::pair< TextSearchJumpTable::iterator, bool > aPair = |
674 | 0 | pJumpTable->insert( aEntry ); |
675 | 0 | if ( !aPair.second ) |
676 | 0 | (*(aPair.first)).second = n; |
677 | 0 | } |
678 | 0 | } |
679 | | |
680 | | void TextSearch::MakeBackwardTab2() |
681 | 0 | { |
682 | | // create the jumptable for the search text |
683 | 0 | if( pJumpTable2 && !bIsForwardTab ) |
684 | 0 | { |
685 | 0 | return; // the jumpTable is ok |
686 | 0 | } |
687 | 0 | bIsForwardTab = false; |
688 | |
|
689 | 0 | sal_Int32 n, nLen = sSrchStr2.getLength(); |
690 | 0 | pJumpTable2.reset( new TextSearchJumpTable ); |
691 | |
|
692 | 0 | for( n = nLen-1; n > 0; --n ) |
693 | 0 | { |
694 | 0 | sal_Unicode cCh = sSrchStr2[n]; |
695 | 0 | TextSearchJumpTable::value_type aEntry( cCh, n ); |
696 | 0 | ::std::pair< TextSearchJumpTable::iterator, bool > aPair = |
697 | 0 | pJumpTable2->insert( aEntry ); |
698 | 0 | if ( !aPair.second ) |
699 | 0 | (*(aPair.first)).second = n; |
700 | 0 | } |
701 | 0 | } |
702 | | |
703 | | sal_Int32 TextSearch::GetDiff( const sal_Unicode cChr ) const |
704 | 165 | { |
705 | 165 | TextSearchJumpTable *pJump; |
706 | 165 | OUString sSearchKey; |
707 | | |
708 | 165 | if ( bUsePrimarySrchStr ) { |
709 | 165 | pJump = pJumpTable.get(); |
710 | 165 | sSearchKey = sSrchStr; |
711 | 165 | } else { |
712 | 0 | pJump = pJumpTable2.get(); |
713 | 0 | sSearchKey = sSrchStr2; |
714 | 0 | } |
715 | | |
716 | 165 | TextSearchJumpTable::const_iterator iLook = pJump->find( cChr ); |
717 | 165 | if ( iLook == pJump->end() ) |
718 | 120 | return sSearchKey.getLength(); |
719 | 45 | return (*iLook).second; |
720 | 165 | } |
721 | | |
722 | | |
723 | | SearchResult TextSearch::NSrchFrwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) |
724 | 24 | { |
725 | 24 | SearchResult aRet; |
726 | 24 | aRet.subRegExpressions = 0; |
727 | | |
728 | 24 | OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2; |
729 | | |
730 | 24 | sal_Int32 nSuchIdx = searchStr.getLength(); |
731 | 24 | sal_Int32 nEnd = endPos; |
732 | 24 | if( !nSuchIdx || !sSearchKey.getLength() || sSearchKey.getLength() > nSuchIdx ) |
733 | 0 | return aRet; |
734 | | |
735 | | |
736 | 24 | if( nEnd < sSearchKey.getLength() ) // position inside the search region ? |
737 | 0 | return aRet; |
738 | | |
739 | 24 | nEnd -= sSearchKey.getLength(); |
740 | | |
741 | 24 | if (bUsePrimarySrchStr) |
742 | 24 | MakeForwardTab(); // create the jumptable |
743 | 0 | else |
744 | 0 | MakeForwardTab2(); |
745 | | |
746 | 24 | for (sal_Int32 nCmpIdx = startPos; // start position for the search |
747 | 189 | nCmpIdx <= nEnd; |
748 | 165 | nCmpIdx += GetDiff( searchStr[nCmpIdx + sSearchKey.getLength()-1])) |
749 | 180 | { |
750 | 180 | nSuchIdx = sSearchKey.getLength() - 1; |
751 | 315 | while( nSuchIdx >= 0 && sSearchKey[nSuchIdx] == searchStr[nCmpIdx + nSuchIdx]) |
752 | 150 | { |
753 | 150 | if( nSuchIdx == 0 ) |
754 | 15 | { |
755 | 15 | if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag ) |
756 | 0 | { |
757 | 0 | sal_Int32 nFndEnd = nCmpIdx + sSearchKey.getLength(); |
758 | 0 | bool bAtStart = !nCmpIdx; |
759 | 0 | bool bAtEnd = nFndEnd == endPos; |
760 | 0 | bool bDelimBefore = bAtStart || IsDelimiter( searchStr, nCmpIdx-1 ); |
761 | 0 | bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nFndEnd ); |
762 | | // * 1 -> only one word in the paragraph |
763 | | // * 2 -> at begin of paragraph |
764 | | // * 3 -> at end of paragraph |
765 | | // * 4 -> inside the paragraph |
766 | 0 | if( !( ( bAtStart && bAtEnd ) || // 1 |
767 | 0 | ( bAtStart && bDelimBehind ) || // 2 |
768 | 0 | ( bAtEnd && bDelimBefore ) || // 3 |
769 | 0 | ( bDelimBefore && bDelimBehind ))) // 4 |
770 | 0 | break; |
771 | 0 | } |
772 | | |
773 | 15 | aRet.subRegExpressions = 1; |
774 | 15 | aRet.startOffset = { nCmpIdx }; |
775 | 15 | aRet.endOffset = { nCmpIdx + sSearchKey.getLength() }; |
776 | | |
777 | 15 | return aRet; |
778 | 15 | } |
779 | 135 | else |
780 | 135 | nSuchIdx--; |
781 | 150 | } |
782 | 180 | } |
783 | 9 | return aRet; |
784 | 24 | } |
785 | | |
786 | | SearchResult TextSearch::NSrchBkwrd( std::unique_lock<std::mutex>& /*rGuard*/,const OUString& searchStr, sal_Int32 startPos, sal_Int32 endPos ) |
787 | 0 | { |
788 | 0 | SearchResult aRet; |
789 | 0 | aRet.subRegExpressions = 0; |
790 | |
|
791 | 0 | OUString sSearchKey = bUsePrimarySrchStr ? sSrchStr : sSrchStr2; |
792 | |
|
793 | 0 | sal_Int32 nSuchIdx = searchStr.getLength(); |
794 | 0 | sal_Int32 nEnd = endPos; |
795 | 0 | if( nSuchIdx == 0 || sSearchKey.isEmpty() || sSearchKey.getLength() > nSuchIdx) |
796 | 0 | return aRet; |
797 | | |
798 | 0 | if (bUsePrimarySrchStr) |
799 | 0 | MakeBackwardTab(); // create the jumptable |
800 | 0 | else |
801 | 0 | MakeBackwardTab2(); |
802 | |
|
803 | 0 | if( nEnd == nSuchIdx ) // end position for the search |
804 | 0 | nEnd = sSearchKey.getLength(); |
805 | 0 | else |
806 | 0 | nEnd += sSearchKey.getLength(); |
807 | |
|
808 | 0 | sal_Int32 nCmpIdx = startPos; // start position for the search |
809 | |
|
810 | 0 | while (nCmpIdx >= nEnd) |
811 | 0 | { |
812 | 0 | nSuchIdx = 0; |
813 | 0 | while( nSuchIdx < sSearchKey.getLength() && sSearchKey[nSuchIdx] == |
814 | 0 | searchStr[nCmpIdx + nSuchIdx - sSearchKey.getLength()] ) |
815 | 0 | nSuchIdx++; |
816 | 0 | if( nSuchIdx >= sSearchKey.getLength() ) |
817 | 0 | { |
818 | 0 | if( SearchFlags::NORM_WORD_ONLY & aSrchPara.searchFlag ) |
819 | 0 | { |
820 | 0 | sal_Int32 nFndStt = nCmpIdx - sSearchKey.getLength(); |
821 | 0 | bool bAtStart = !nFndStt; |
822 | 0 | bool bAtEnd = nCmpIdx == startPos; |
823 | 0 | bool bDelimBehind = bAtEnd || IsDelimiter( searchStr, nCmpIdx ); |
824 | 0 | bool bDelimBefore = bAtStart || // begin of paragraph |
825 | 0 | IsDelimiter( searchStr, nFndStt-1 ); |
826 | | // * 1 -> only one word in the paragraph |
827 | | // * 2 -> at begin of paragraph |
828 | | // * 3 -> at end of paragraph |
829 | | // * 4 -> inside the paragraph |
830 | 0 | if( ( bAtStart && bAtEnd ) || // 1 |
831 | 0 | ( bAtStart && bDelimBehind ) || // 2 |
832 | 0 | ( bAtEnd && bDelimBefore ) || // 3 |
833 | 0 | ( bDelimBefore && bDelimBehind )) // 4 |
834 | 0 | { |
835 | 0 | aRet.subRegExpressions = 1; |
836 | 0 | aRet.startOffset = { nCmpIdx }; |
837 | 0 | aRet.endOffset = { nCmpIdx - sSearchKey.getLength() }; |
838 | 0 | return aRet; |
839 | 0 | } |
840 | 0 | } |
841 | 0 | else |
842 | 0 | { |
843 | 0 | aRet.subRegExpressions = 1; |
844 | 0 | aRet.startOffset = { nCmpIdx }; |
845 | 0 | aRet.endOffset = { nCmpIdx - sSearchKey.getLength() }; |
846 | 0 | return aRet; |
847 | 0 | } |
848 | 0 | } |
849 | 0 | nSuchIdx = GetDiff( searchStr[nCmpIdx - sSearchKey.getLength()] ); |
850 | 0 | if( nCmpIdx < nSuchIdx ) |
851 | 0 | return aRet; |
852 | 0 | nCmpIdx -= nSuchIdx; |
853 | 0 | } |
854 | 0 | return aRet; |
855 | 0 | } |
856 | | |
857 | | void TextSearch::RESrchPrepare( const css::util::SearchOptions2& rOptions) |
858 | 0 | { |
859 | 0 | TransliterationFlags transliterateFlags = static_cast<TransliterationFlags>(rOptions.transliterateFlags); |
860 | | // select the transliterated pattern string |
861 | 0 | const OUString& rPatternStr = |
862 | 0 | (isSimpleTrans(transliterateFlags) ? sSrchStr |
863 | 0 | : (isComplexTrans(transliterateFlags) ? sSrchStr2 : rOptions.searchString)); |
864 | |
|
865 | 0 | sal_uInt32 nIcuSearchFlags = UREGEX_UWORD; // request UAX#29 unicode capability |
866 | | // map css::util::SearchFlags to ICU uregex.h flags |
867 | | // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE |
868 | | // REG_NEWLINE is neither properly defined nor used anywhere => not implemented |
869 | | // REG_NOSUB is not used anywhere => not implemented |
870 | | // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute |
871 | | // LEV_RELAXED is only used for SearchAlgorithm==Approximate |
872 | | // Note that the search flag ALL_IGNORE_CASE is deprecated in UNO |
873 | | // probably because the transliteration flag IGNORE_CASE handles it as well. |
874 | 0 | if( (rOptions.searchFlag & css::util::SearchFlags::ALL_IGNORE_CASE) != 0 |
875 | 0 | || (transliterateFlags & TransliterationFlags::IGNORE_CASE)) |
876 | 0 | nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE; |
877 | 0 | UErrorCode nIcuErr = U_ZERO_ERROR; |
878 | | // assumption: transliteration didn't mangle regexp control chars |
879 | 0 | icu::UnicodeString aIcuSearchPatStr( reinterpret_cast<const UChar*>(rPatternStr.getStr()), rPatternStr.getLength()); |
880 | 0 | #ifndef DISABLE_WORDBOUND_EMULATION |
881 | | // for convenience specific syntax elements of the old regex engine are emulated |
882 | | // - by replacing \< with "word-break followed by a look-ahead word-char" |
883 | 0 | static const icu::UnicodeString aChevronPatternB( "\\\\<", -1, icu::UnicodeString::kInvariant); |
884 | 0 | static const icu::UnicodeString aChevronReplaceB( "\\\\b(?=\\\\w)", -1, icu::UnicodeString::kInvariant); |
885 | 0 | static icu::RegexMatcher aChevronMatcherB( aChevronPatternB, 0, nIcuErr); |
886 | 0 | aChevronMatcherB.reset( aIcuSearchPatStr); |
887 | 0 | aIcuSearchPatStr = aChevronMatcherB.replaceAll( aChevronReplaceB, nIcuErr); |
888 | 0 | aChevronMatcherB.reset(); |
889 | | // - by replacing \> with "look-behind word-char followed by a word-break" |
890 | 0 | static const icu::UnicodeString aChevronPatternE( "\\\\>", -1, icu::UnicodeString::kInvariant); |
891 | 0 | static const icu::UnicodeString aChevronReplaceE( "(?<=\\\\w)\\\\b", -1, icu::UnicodeString::kInvariant); |
892 | 0 | static icu::RegexMatcher aChevronMatcherE( aChevronPatternE, 0, nIcuErr); |
893 | 0 | aChevronMatcherE.reset( aIcuSearchPatStr); |
894 | 0 | aIcuSearchPatStr = aChevronMatcherE.replaceAll( aChevronReplaceE, nIcuErr); |
895 | 0 | aChevronMatcherE.reset(); |
896 | 0 | #endif |
897 | 0 | pRegexMatcher.reset( new icu::RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr) ); |
898 | 0 | if (nIcuErr) |
899 | 0 | { |
900 | 0 | SAL_INFO( "i18npool", "TextSearch::RESrchPrepare UErrorCode " << nIcuErr); |
901 | 0 | pRegexMatcher.reset(); |
902 | 0 | } |
903 | 0 | else |
904 | 0 | { |
905 | | // Pathological patterns may result in exponential run time making the |
906 | | // application appear to be frozen. Limit that. Documentation for this |
907 | | // call says |
908 | | // https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1RegexMatcher.html#a6ebcfcab4fe6a38678c0291643a03a00 |
909 | | // "The units of the limit are steps of the match engine. |
910 | | // Correspondence with actual processor time will depend on the speed |
911 | | // of the processor and the details of the specific pattern, but will |
912 | | // typically be on the order of milliseconds." |
913 | | // Just what is a good value? 42 is always an answer ... the 23 enigma |
914 | | // as well... which on the dev's machine is roughly 50 seconds with the |
915 | | // pattern of fdo#70627. |
916 | | /* TODO: make this a configuration settable value and possibly take |
917 | | * complexity of expression into account and maybe even length of text |
918 | | * to be matched; currently (2013-11-25) that is at most one 64k |
919 | | * paragraph per RESrchFrwrd()/RESrchBkwrd() call. */ |
920 | 0 | pRegexMatcher->setTimeLimit( 23*1000, nIcuErr); |
921 | 0 | } |
922 | 0 | } |
923 | | |
924 | | |
925 | | static bool lcl_findRegex(std::unique_ptr<icu::RegexMatcher> const& pRegexMatcher, |
926 | | sal_Int32 nStartPos, sal_Int32 nEndPos, UErrorCode& rIcuErr) |
927 | 0 | { |
928 | 0 | pRegexMatcher->region(nStartPos, nEndPos, rIcuErr); |
929 | 0 | pRegexMatcher->useAnchoringBounds(false); // use whole text's anchoring bounds, not region's |
930 | 0 | pRegexMatcher->useTransparentBounds(true); // take text outside of the region into account for |
931 | | // look-ahead/behind assertions |
932 | |
|
933 | 0 | if (!pRegexMatcher->find(rIcuErr)) |
934 | 0 | { |
935 | | /* TODO: future versions could pass the UErrorCode or translations |
936 | | * thereof to the caller, for example to inform the user of |
937 | | * U_REGEX_TIME_OUT. The strange thing though is that an error is set |
938 | | * only after the second call that returns immediately and not if |
939 | | * timeout occurred on the first call?!? */ |
940 | 0 | SAL_INFO( "i18npool", "lcl_findRegex UErrorCode " << rIcuErr); |
941 | 0 | return false; |
942 | 0 | } |
943 | 0 | return true; |
944 | 0 | } |
945 | | |
946 | | SearchResult TextSearch::RESrchFrwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr, |
947 | | sal_Int32 startPos, sal_Int32 endPos ) |
948 | 0 | { |
949 | 0 | SearchResult aRet; |
950 | 0 | aRet.subRegExpressions = 0; |
951 | 0 | if( !pRegexMatcher) |
952 | 0 | return aRet; |
953 | | |
954 | 0 | if( endPos > searchStr.getLength()) |
955 | 0 | endPos = searchStr.getLength(); |
956 | | |
957 | | // use the ICU RegexMatcher to find the matches |
958 | 0 | UErrorCode nIcuErr = U_ZERO_ERROR; |
959 | 0 | const icu::UnicodeString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()), |
960 | 0 | searchStr.getLength()); |
961 | 0 | pRegexMatcher->reset( aSearchTargetStr); |
962 | | // search until there is a valid match |
963 | 0 | for(;;) |
964 | 0 | { |
965 | 0 | if (!lcl_findRegex( pRegexMatcher, startPos, endPos, nIcuErr)) |
966 | 0 | return aRet; |
967 | | |
968 | | // #i118887# ignore zero-length matches e.g. "a*" in "bc" |
969 | 0 | int nStartOfs = pRegexMatcher->start( nIcuErr); |
970 | 0 | int nEndOfs = pRegexMatcher->end( nIcuErr); |
971 | 0 | if( nStartOfs < nEndOfs) |
972 | 0 | break; |
973 | | // If the zero-length match is behind the string, do not match it again |
974 | | // and again until startPos reaches there. A match behind the string is |
975 | | // a "$" anchor. |
976 | 0 | if (nStartOfs == endPos) |
977 | 0 | break; |
978 | | // try at next position if there was a zero-length match |
979 | 0 | if( ++startPos >= endPos) |
980 | 0 | return aRet; |
981 | 0 | } |
982 | | |
983 | | // extract the result of the search |
984 | 0 | const int nGroupCount = pRegexMatcher->groupCount(); |
985 | 0 | aRet.subRegExpressions = nGroupCount + 1; |
986 | 0 | aRet.startOffset.realloc( aRet.subRegExpressions); |
987 | 0 | auto pstartOffset = aRet.startOffset.getArray(); |
988 | 0 | aRet.endOffset.realloc( aRet.subRegExpressions); |
989 | 0 | auto pendOffset = aRet.endOffset.getArray(); |
990 | 0 | pstartOffset[0] = pRegexMatcher->start( nIcuErr); |
991 | 0 | pendOffset[0] = pRegexMatcher->end( nIcuErr); |
992 | 0 | for( int i = 1; i <= nGroupCount; ++i) { |
993 | 0 | pstartOffset[i] = pRegexMatcher->start( i, nIcuErr); |
994 | 0 | pendOffset[i] = pRegexMatcher->end( i, nIcuErr); |
995 | 0 | } |
996 | |
|
997 | 0 | return aRet; |
998 | 0 | } |
999 | | |
1000 | | SearchResult TextSearch::RESrchBkwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr, |
1001 | | sal_Int32 startPos, sal_Int32 endPos ) |
1002 | 0 | { |
1003 | | // NOTE: for backwards search callers provide startPos/endPos inverted! |
1004 | 0 | SearchResult aRet; |
1005 | 0 | aRet.subRegExpressions = 0; |
1006 | 0 | if( !pRegexMatcher) |
1007 | 0 | return aRet; |
1008 | | |
1009 | 0 | if( startPos > searchStr.getLength()) |
1010 | 0 | startPos = searchStr.getLength(); |
1011 | | |
1012 | | // use the ICU RegexMatcher to find the matches |
1013 | | // TODO: use ICU's backward searching once it becomes available |
1014 | | // as its replacement using forward search is not as good as the real thing |
1015 | 0 | UErrorCode nIcuErr = U_ZERO_ERROR; |
1016 | 0 | const icu::UnicodeString aSearchTargetStr(false, reinterpret_cast<const UChar*>(searchStr.getStr()), |
1017 | 0 | searchStr.getLength()); |
1018 | 0 | pRegexMatcher->reset( aSearchTargetStr); |
1019 | 0 | if (!lcl_findRegex( pRegexMatcher, endPos, startPos, nIcuErr)) |
1020 | 0 | return aRet; |
1021 | | |
1022 | | // find the last match |
1023 | 0 | int nLastPos = 0; |
1024 | 0 | int nFoundEnd = 0; |
1025 | 0 | int nGoodPos = 0, nGoodEnd = 0; |
1026 | 0 | bool bFirst = true; |
1027 | 0 | do { |
1028 | 0 | nLastPos = pRegexMatcher->start( nIcuErr); |
1029 | 0 | nFoundEnd = pRegexMatcher->end( nIcuErr); |
1030 | 0 | if (nLastPos < nFoundEnd) |
1031 | 0 | { |
1032 | | // remember last non-zero-length match |
1033 | 0 | nGoodPos = nLastPos; |
1034 | 0 | nGoodEnd = nFoundEnd; |
1035 | 0 | } |
1036 | 0 | if( nFoundEnd >= startPos) |
1037 | 0 | break; |
1038 | 0 | bFirst = false; |
1039 | 0 | if( nFoundEnd == nLastPos) |
1040 | 0 | ++nFoundEnd; |
1041 | 0 | } while( lcl_findRegex( pRegexMatcher, nFoundEnd, startPos, nIcuErr)); |
1042 | | |
1043 | | // Ignore all zero-length matches except "$" anchor on first match. |
1044 | 0 | if (nGoodPos == nGoodEnd) |
1045 | 0 | { |
1046 | 0 | if (bFirst && nLastPos == startPos) |
1047 | 0 | nGoodPos = nLastPos; |
1048 | 0 | else |
1049 | 0 | return aRet; |
1050 | 0 | } |
1051 | | |
1052 | | // find last match again to get its details |
1053 | 0 | lcl_findRegex( pRegexMatcher, nGoodPos, startPos, nIcuErr); |
1054 | | |
1055 | | // fill in the details of the last match |
1056 | 0 | const int nGroupCount = pRegexMatcher->groupCount(); |
1057 | 0 | aRet.subRegExpressions = nGroupCount + 1; |
1058 | 0 | aRet.startOffset.realloc( aRet.subRegExpressions); |
1059 | 0 | auto pstartOffset = aRet.startOffset.getArray(); |
1060 | 0 | aRet.endOffset.realloc( aRet.subRegExpressions); |
1061 | 0 | auto pendOffset = aRet.endOffset.getArray(); |
1062 | | // NOTE: existing users of backward search seem to expect startOfs/endOfs being inverted! |
1063 | 0 | pstartOffset[0] = pRegexMatcher->end( nIcuErr); |
1064 | 0 | pendOffset[0] = pRegexMatcher->start( nIcuErr); |
1065 | 0 | for( int i = 1; i <= nGroupCount; ++i) { |
1066 | 0 | pstartOffset[i] = pRegexMatcher->end( i, nIcuErr); |
1067 | 0 | pendOffset[i] = pRegexMatcher->start( i, nIcuErr); |
1068 | 0 | } |
1069 | |
|
1070 | 0 | return aRet; |
1071 | 0 | } |
1072 | | |
1073 | | |
1074 | | // search for words phonetically |
1075 | | SearchResult TextSearch::ApproxSrchFrwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr, |
1076 | | sal_Int32 startPos, sal_Int32 endPos ) |
1077 | 0 | { |
1078 | 0 | SearchResult aRet; |
1079 | 0 | aRet.subRegExpressions = 0; |
1080 | |
|
1081 | 0 | if( !xBreak.is() ) |
1082 | 0 | return aRet; |
1083 | | |
1084 | 0 | sal_Int32 nStt, nEnd; |
1085 | |
|
1086 | 0 | Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos, |
1087 | 0 | aSrchPara.Locale, |
1088 | 0 | WordType::ANYWORD_IGNOREWHITESPACES, true ); |
1089 | |
|
1090 | 0 | do |
1091 | 0 | { |
1092 | 0 | if( aWBnd.startPos >= endPos ) |
1093 | 0 | break; |
1094 | 0 | nStt = aWBnd.startPos < startPos ? startPos : aWBnd.startPos; |
1095 | 0 | nEnd = std::min(aWBnd.endPos, endPos); |
1096 | |
|
1097 | 0 | if( nStt < nEnd && |
1098 | 0 | pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit ) |
1099 | 0 | { |
1100 | 0 | aRet.subRegExpressions = 1; |
1101 | 0 | aRet.startOffset = { nStt }; |
1102 | 0 | aRet.endOffset = { nEnd }; |
1103 | 0 | break; |
1104 | 0 | } |
1105 | | |
1106 | 0 | nStt = nEnd - 1; |
1107 | 0 | aWBnd = xBreak->nextWord( searchStr, nStt, aSrchPara.Locale, |
1108 | 0 | WordType::ANYWORD_IGNOREWHITESPACES); |
1109 | 0 | } while( aWBnd.startPos != aWBnd.endPos || |
1110 | 0 | (aWBnd.endPos != searchStr.getLength() && aWBnd.endPos != nEnd) ); |
1111 | | // #i50244# aWBnd.endPos != nEnd : in case there is _no_ word (only |
1112 | | // whitespace) in searchStr, getWordBoundary() returned startPos,startPos |
1113 | | // and nextWord() does also => don't loop forever. |
1114 | 0 | return aRet; |
1115 | 0 | } |
1116 | | |
1117 | | SearchResult TextSearch::ApproxSrchBkwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr, |
1118 | | sal_Int32 startPos, sal_Int32 endPos ) |
1119 | 0 | { |
1120 | 0 | SearchResult aRet; |
1121 | 0 | aRet.subRegExpressions = 0; |
1122 | |
|
1123 | 0 | if( !xBreak.is() ) |
1124 | 0 | return aRet; |
1125 | | |
1126 | 0 | sal_Int32 nStt, nEnd; |
1127 | |
|
1128 | 0 | Boundary aWBnd = xBreak->getWordBoundary( searchStr, startPos, |
1129 | 0 | aSrchPara.Locale, |
1130 | 0 | WordType::ANYWORD_IGNOREWHITESPACES, true ); |
1131 | |
|
1132 | 0 | do |
1133 | 0 | { |
1134 | 0 | if( aWBnd.endPos <= endPos ) |
1135 | 0 | break; |
1136 | 0 | nStt = aWBnd.startPos < endPos ? endPos : aWBnd.startPos; |
1137 | 0 | nEnd = std::min(aWBnd.endPos, startPos); |
1138 | |
|
1139 | 0 | if( nStt < nEnd && |
1140 | 0 | pWLD->WLD( searchStr.getStr() + nStt, nEnd - nStt ) <= nLimit ) |
1141 | 0 | { |
1142 | 0 | aRet.subRegExpressions = 1; |
1143 | 0 | aRet.startOffset = { nEnd }; |
1144 | 0 | aRet.endOffset = { nStt }; |
1145 | 0 | break; |
1146 | 0 | } |
1147 | 0 | if( !nStt ) |
1148 | 0 | break; |
1149 | | |
1150 | 0 | aWBnd = xBreak->previousWord( searchStr, nStt, aSrchPara.Locale, |
1151 | 0 | WordType::ANYWORD_IGNOREWHITESPACES); |
1152 | 0 | } while( aWBnd.startPos != aWBnd.endPos || aWBnd.endPos != searchStr.getLength() ); |
1153 | 0 | return aRet; |
1154 | 0 | } |
1155 | | |
1156 | | |
1157 | | namespace { |
1158 | | void setWildcardMatch( css::util::SearchResult& rRes, sal_Int32 nStartOffset, sal_Int32 nEndOffset ) |
1159 | 170 | { |
1160 | 170 | rRes.subRegExpressions = 1; |
1161 | 170 | rRes.startOffset = { nStartOffset }; |
1162 | 170 | rRes.endOffset = { nEndOffset }; |
1163 | 170 | } |
1164 | | } |
1165 | | |
1166 | | SearchResult TextSearch::WildcardSrchFrwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos ) |
1167 | 1.29k | { |
1168 | 1.29k | SearchResult aRes; |
1169 | 1.29k | aRes.subRegExpressions = 0; // no match |
1170 | 1.29k | sal_Int32 nStartOffset = nStartPos; |
1171 | 1.29k | sal_Int32 nEndOffset = nEndPos; |
1172 | | |
1173 | 1.29k | const sal_Int32 nStringLen = searchStr.getLength(); |
1174 | | |
1175 | | // Forward nStartPos inclusive, nEndPos exclusive, but allow for empty |
1176 | | // string match with [0,0). |
1177 | 1.29k | if (nStartPos < 0 || nEndPos > nStringLen || nEndPos < nStartPos || |
1178 | 1.29k | (nStartPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos))) |
1179 | 0 | return aRes; |
1180 | | |
1181 | 1.29k | const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2); |
1182 | 1.29k | const sal_Int32 nPatternLen = rPattern.getLength(); |
1183 | | |
1184 | | // Handle special cases empty pattern and/or string outside of the loop to |
1185 | | // not add performance penalties there and simplify. |
1186 | 1.29k | if (nStartPos == nEndPos) |
1187 | 0 | { |
1188 | 0 | sal_Int32 i = 0; |
1189 | 0 | while (i < nPatternLen && rPattern[i] == '*') |
1190 | 0 | ++i; |
1191 | 0 | if (i == nPatternLen) |
1192 | 0 | setWildcardMatch( aRes, nStartOffset, nEndOffset); |
1193 | 0 | return aRes; |
1194 | 0 | } |
1195 | | |
1196 | | // Empty pattern does not match any non-empty string. |
1197 | 1.29k | if (!nPatternLen) |
1198 | 0 | return aRes; |
1199 | | |
1200 | 1.29k | bool bRewind = false; |
1201 | 1.29k | sal_uInt32 cPattern = 0; |
1202 | 1.29k | sal_Int32 nPattern = 0; |
1203 | 1.29k | sal_Int32 nAfterFakePattern = nPattern; |
1204 | 1.29k | if (mbWildcardAllowSubstring) |
1205 | 0 | { |
1206 | | // Fake a leading '*' wildcard. |
1207 | 0 | cPattern = '*'; |
1208 | 0 | bRewind = true; |
1209 | | // Assume a non-'*' pattern character follows. If it is a '*' instead |
1210 | | // that will be handled in the loop by setting nPat. |
1211 | 0 | sal_uInt32 cu = rPattern.iterateCodePoints( &nAfterFakePattern); |
1212 | 0 | if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern < nPatternLen) |
1213 | 0 | rPattern.iterateCodePoints( &nAfterFakePattern); |
1214 | 0 | } |
1215 | | |
1216 | 1.29k | sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1; |
1217 | 1.29k | sal_uInt32 cPatternAfterAsterisk = 0; |
1218 | 1.29k | bool bEscaped = false, bEscapedAfterAsterisk = false; |
1219 | | |
1220 | | // The loop code tries to avoid costly calls to iterateCodePoints() when |
1221 | | // possible. |
1222 | | |
1223 | 1.29k | do |
1224 | 3.32k | { |
1225 | 3.32k | if (bRewind) |
1226 | 0 | { |
1227 | | // Reuse cPattern after '*', nPattern was correspondingly |
1228 | | // incremented to point behind cPattern. |
1229 | 0 | bRewind = false; |
1230 | 0 | } |
1231 | 3.32k | else if (nPattern < nPatternLen) |
1232 | 3.32k | { |
1233 | | // nPattern will be incremented by iterateCodePoints(). |
1234 | 3.32k | cPattern = rPattern.iterateCodePoints( &nPattern); |
1235 | 3.32k | if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen) |
1236 | 0 | { |
1237 | 0 | bEscaped = true; |
1238 | 0 | cPattern = rPattern.iterateCodePoints( &nPattern); |
1239 | 0 | } |
1240 | 3.32k | } |
1241 | 0 | else |
1242 | 0 | { |
1243 | | // A trailing '*' is handled below. |
1244 | 0 | if (mbWildcardAllowSubstring) |
1245 | 0 | { |
1246 | | // If the pattern is consumed and substring match allowed we're good. |
1247 | 0 | setWildcardMatch( aRes, nStartOffset, nString); |
1248 | 0 | return aRes; |
1249 | 0 | } |
1250 | 0 | else if (nString < nEndPos && nLastAsterisk >= 0) |
1251 | 0 | { |
1252 | | // If substring match is not allowed try a greedy '*' match. |
1253 | 0 | nPattern = nLastAsterisk; |
1254 | 0 | continue; // do |
1255 | 0 | } |
1256 | 0 | else |
1257 | 0 | return aRes; |
1258 | 0 | } |
1259 | | |
1260 | 3.32k | if (cPattern == '*' && !bEscaped) |
1261 | 170 | { |
1262 | | // '*' is one code unit, so not using iterateCodePoints() is ok. |
1263 | 170 | while (nPattern < nPatternLen && rPattern[nPattern] == '*') |
1264 | 0 | ++nPattern; |
1265 | | |
1266 | 170 | if (nPattern >= nPatternLen) |
1267 | 170 | { |
1268 | | // Last pattern is '*', remaining string matches. |
1269 | 170 | setWildcardMatch( aRes, nStartOffset, nEndOffset); |
1270 | 170 | return aRes; |
1271 | 170 | } |
1272 | | |
1273 | 0 | nLastAsterisk = nPattern; // Remember last encountered '*'. |
1274 | | |
1275 | | // cPattern will be the next non-'*' character, nPattern |
1276 | | // incremented. |
1277 | 0 | cPattern = rPattern.iterateCodePoints( &nPattern); |
1278 | 0 | if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern < nPatternLen) |
1279 | 0 | { |
1280 | 0 | bEscaped = true; |
1281 | 0 | cPattern = rPattern.iterateCodePoints( &nPattern); |
1282 | 0 | } |
1283 | |
|
1284 | 0 | cPatternAfterAsterisk = cPattern; |
1285 | 0 | bEscapedAfterAsterisk = bEscaped; |
1286 | 0 | nPat = nPattern; // Remember position of pattern behind '*', already incremented. |
1287 | 0 | nStr = nString; // Remember the current string to be matched. |
1288 | 0 | } |
1289 | | |
1290 | 3.15k | if (nString >= nEndPos) |
1291 | | // Whatever follows in pattern, string will not match. |
1292 | 0 | return aRes; |
1293 | | |
1294 | | // nString will be incremented by iterateCodePoints(). |
1295 | 3.15k | sal_uInt32 cString = searchStr.iterateCodePoints( &nString); |
1296 | | |
1297 | 3.15k | if ((cPattern != '?' || bEscaped) && cPattern != cString) |
1298 | 1.12k | { |
1299 | 1.12k | if (nPat == -1) |
1300 | | // Non-match already without any '*' pattern. |
1301 | 1.12k | return aRes; |
1302 | | |
1303 | 0 | bRewind = true; |
1304 | 0 | nPattern = nPat; // Rewind pattern to character behind '*', already incremented. |
1305 | 0 | cPattern = cPatternAfterAsterisk; |
1306 | 0 | bEscaped = bEscapedAfterAsterisk; |
1307 | 0 | searchStr.iterateCodePoints( &nStr); |
1308 | 0 | nString = nStr; // Restore incremented remembered string position. |
1309 | 0 | if (nPat == nAfterFakePattern) |
1310 | 0 | { |
1311 | | // Next start offset will be the next character. |
1312 | 0 | nStartOffset = nString; |
1313 | 0 | } |
1314 | 0 | } |
1315 | 2.02k | else |
1316 | 2.02k | { |
1317 | | // An unescaped '?' pattern matched any character, or characters |
1318 | | // matched. Reset only escaped state. |
1319 | 2.02k | bEscaped = false; |
1320 | 2.02k | } |
1321 | 3.15k | } |
1322 | 2.02k | while (nString < nEndPos); |
1323 | | |
1324 | 0 | if (bRewind) |
1325 | 0 | return aRes; |
1326 | | |
1327 | | // Eat trailing '*' pattern that matches anything, including nothing. |
1328 | | // '*' is one code unit, so not using iterateCodePoints() is ok. |
1329 | 0 | while (nPattern < nPatternLen && rPattern[nPattern] == '*') |
1330 | 0 | ++nPattern; |
1331 | |
|
1332 | 0 | if (nPattern == nPatternLen) |
1333 | 0 | setWildcardMatch( aRes, nStartOffset, nEndOffset); |
1334 | 0 | return aRes; |
1335 | 0 | } |
1336 | | |
1337 | | SearchResult TextSearch::WildcardSrchBkwrd( std::unique_lock<std::mutex>& /*rGuard*/, const OUString& searchStr, sal_Int32 nStartPos, sal_Int32 nEndPos ) |
1338 | 0 | { |
1339 | 0 | SearchResult aRes; |
1340 | 0 | aRes.subRegExpressions = 0; // no match |
1341 | |
|
1342 | 0 | sal_Int32 nStartOffset = nStartPos; |
1343 | 0 | sal_Int32 nEndOffset = nEndPos; |
1344 | |
|
1345 | 0 | const sal_Int32 nStringLen = searchStr.getLength(); |
1346 | | |
1347 | | // Backward nStartPos exclusive, nEndPos inclusive, but allow for empty |
1348 | | // string match with (0,0]. |
1349 | 0 | if (nStartPos > nStringLen || nEndPos < 0 || nStartPos < nEndPos || |
1350 | 0 | (nEndPos == nStringLen && (nStringLen != 0 || nStartPos != nEndPos))) |
1351 | 0 | return aRes; |
1352 | | |
1353 | 0 | const OUString& rPattern = (bUsePrimarySrchStr ? sSrchStr : sSrchStr2); |
1354 | 0 | sal_Int32 nPatternLen = rPattern.getLength(); |
1355 | | |
1356 | | // Handle special cases empty pattern and/or string outside of the loop to |
1357 | | // not add performance penalties there and simplify. |
1358 | 0 | if (nStartPos == nEndPos) |
1359 | 0 | { |
1360 | 0 | sal_Int32 i = 0; |
1361 | 0 | while (i < nPatternLen && rPattern[i] == '*') |
1362 | 0 | ++i; |
1363 | 0 | if (i == nPatternLen) |
1364 | 0 | setWildcardMatch( aRes, nStartOffset, nEndOffset); |
1365 | 0 | return aRes; |
1366 | 0 | } |
1367 | | |
1368 | | // Empty pattern does not match any non-empty string. |
1369 | 0 | if (!nPatternLen) |
1370 | 0 | return aRes; |
1371 | | |
1372 | | // Reverse escaped patterns to ease the handling of escapes, keeping escape |
1373 | | // and following character as one sequence in backward direction. |
1374 | 0 | if ((bUsePrimarySrchStr && maWildcardReversePattern.isEmpty()) || |
1375 | 0 | (!bUsePrimarySrchStr && maWildcardReversePattern2.isEmpty())) |
1376 | 0 | { |
1377 | 0 | OUStringBuffer aPatternBuf( rPattern); |
1378 | 0 | sal_Int32 nIndex = 0; |
1379 | 0 | while (nIndex < nPatternLen) |
1380 | 0 | { |
1381 | 0 | const sal_Int32 nOld = nIndex; |
1382 | 0 | const sal_uInt32 cu = rPattern.iterateCodePoints( &nIndex); |
1383 | 0 | if (cu == mcWildcardEscapeChar) |
1384 | 0 | { |
1385 | 0 | if (nIndex < nPatternLen) |
1386 | 0 | { |
1387 | 0 | if (nIndex - nOld == 1) |
1388 | 0 | { |
1389 | | // Simply move code units, we already memorized the one |
1390 | | // in 'cu'. |
1391 | 0 | const sal_Int32 nOld2 = nIndex; |
1392 | 0 | rPattern.iterateCodePoints( &nIndex); |
1393 | 0 | for (sal_Int32 i=0; i < nIndex - nOld2; ++i) |
1394 | 0 | aPatternBuf[nOld+i] = rPattern[nOld2+i]; |
1395 | 0 | aPatternBuf[nIndex-1] = static_cast<sal_Unicode>(cu); |
1396 | 0 | } |
1397 | 0 | else |
1398 | 0 | { |
1399 | | // Copy the escape character code units first in the |
1400 | | // unlikely case that it would not be of BMP. |
1401 | 0 | assert(nIndex - nOld == 2); // it's UTF-16, so... |
1402 | 0 | sal_Unicode buf[2]; |
1403 | 0 | buf[0] = rPattern[nOld]; |
1404 | 0 | buf[1] = rPattern[nOld+1]; |
1405 | 0 | const sal_Int32 nOld2 = nIndex; |
1406 | 0 | rPattern.iterateCodePoints( &nIndex); |
1407 | 0 | for (sal_Int32 i=0; i < nIndex - nOld2; ++i) |
1408 | 0 | aPatternBuf[nOld+i] = rPattern[nOld2+i]; |
1409 | 0 | aPatternBuf[nIndex-2] = buf[0]; |
1410 | 0 | aPatternBuf[nIndex-1] = buf[1]; |
1411 | 0 | } |
1412 | 0 | } |
1413 | 0 | else |
1414 | 0 | { |
1415 | | // Trailing escape would become leading escape, do what? |
1416 | | // Eliminate. |
1417 | 0 | aPatternBuf.remove( nOld, nIndex - nOld); |
1418 | 0 | } |
1419 | 0 | } |
1420 | 0 | } |
1421 | 0 | if (bUsePrimarySrchStr) |
1422 | 0 | maWildcardReversePattern = aPatternBuf.makeStringAndClear(); |
1423 | 0 | else |
1424 | 0 | maWildcardReversePattern2 = aPatternBuf.makeStringAndClear(); |
1425 | 0 | } |
1426 | 0 | const OUString& rReversePattern = (bUsePrimarySrchStr ? maWildcardReversePattern : maWildcardReversePattern2); |
1427 | 0 | nPatternLen = rReversePattern.getLength(); |
1428 | |
|
1429 | 0 | bool bRewind = false; |
1430 | 0 | sal_uInt32 cPattern = 0; |
1431 | 0 | sal_Int32 nPattern = nPatternLen; |
1432 | 0 | sal_Int32 nAfterFakePattern = nPattern; |
1433 | 0 | if (mbWildcardAllowSubstring) |
1434 | 0 | { |
1435 | | // Fake a trailing '*' wildcard. |
1436 | 0 | cPattern = '*'; |
1437 | 0 | bRewind = true; |
1438 | | // Assume a non-'*' pattern character follows. If it is a '*' instead |
1439 | | // that will be handled in the loop by setting nPat. |
1440 | 0 | sal_uInt32 cu = rReversePattern.iterateCodePoints( &nAfterFakePattern, -1); |
1441 | 0 | if (cu == mcWildcardEscapeChar && mcWildcardEscapeChar && nAfterFakePattern > 0) |
1442 | 0 | rReversePattern.iterateCodePoints( &nAfterFakePattern, -1); |
1443 | 0 | } |
1444 | |
|
1445 | 0 | sal_Int32 nString = nStartPos, nPat = -1, nStr = -1, nLastAsterisk = -1; |
1446 | 0 | sal_uInt32 cPatternAfterAsterisk = 0; |
1447 | 0 | bool bEscaped = false, bEscapedAfterAsterisk = false; |
1448 | | |
1449 | | // The loop code tries to avoid costly calls to iterateCodePoints() when |
1450 | | // possible. |
1451 | |
|
1452 | 0 | do |
1453 | 0 | { |
1454 | 0 | if (bRewind) |
1455 | 0 | { |
1456 | | // Reuse cPattern after '*', nPattern was correspondingly |
1457 | | // decremented to point before cPattern. |
1458 | 0 | bRewind = false; |
1459 | 0 | } |
1460 | 0 | else if (nPattern > 0) |
1461 | 0 | { |
1462 | | // nPattern will be decremented by iterateCodePoints(). |
1463 | 0 | cPattern = rReversePattern.iterateCodePoints( &nPattern, -1); |
1464 | 0 | if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0) |
1465 | 0 | { |
1466 | 0 | bEscaped = true; |
1467 | 0 | cPattern = rReversePattern.iterateCodePoints( &nPattern, -1); |
1468 | 0 | } |
1469 | 0 | } |
1470 | 0 | else |
1471 | 0 | { |
1472 | | // A trailing '*' is handled below. |
1473 | 0 | if (mbWildcardAllowSubstring) |
1474 | 0 | { |
1475 | | // If the pattern is consumed and substring match allowed we're good. |
1476 | 0 | setWildcardMatch( aRes, nStartOffset, nString); |
1477 | 0 | return aRes; |
1478 | 0 | } |
1479 | 0 | else if (nString > nEndPos && nLastAsterisk >= 0) |
1480 | 0 | { |
1481 | | // If substring match is not allowed try a greedy '*' match. |
1482 | 0 | nPattern = nLastAsterisk; |
1483 | 0 | continue; // do |
1484 | 0 | } |
1485 | 0 | else |
1486 | 0 | return aRes; |
1487 | 0 | } |
1488 | | |
1489 | 0 | if (cPattern == '*' && !bEscaped) |
1490 | 0 | { |
1491 | | // '*' is one code unit, so not using iterateCodePoints() is ok. |
1492 | 0 | while (nPattern > 0 && rReversePattern[nPattern-1] == '*') |
1493 | 0 | --nPattern; |
1494 | |
|
1495 | 0 | if (nPattern <= 0) |
1496 | 0 | { |
1497 | | // First pattern is '*', remaining string matches. |
1498 | 0 | setWildcardMatch( aRes, nStartOffset, nEndOffset); |
1499 | 0 | return aRes; |
1500 | 0 | } |
1501 | | |
1502 | 0 | nLastAsterisk = nPattern; // Remember last encountered '*'. |
1503 | | |
1504 | | // cPattern will be the previous non-'*' character, nPattern |
1505 | | // decremented. |
1506 | 0 | cPattern = rReversePattern.iterateCodePoints( &nPattern, -1); |
1507 | 0 | if (cPattern == mcWildcardEscapeChar && mcWildcardEscapeChar && nPattern > 0) |
1508 | 0 | { |
1509 | 0 | bEscaped = true; |
1510 | 0 | cPattern = rReversePattern.iterateCodePoints( &nPattern, -1); |
1511 | 0 | } |
1512 | |
|
1513 | 0 | cPatternAfterAsterisk = cPattern; |
1514 | 0 | bEscapedAfterAsterisk = bEscaped; |
1515 | 0 | nPat = nPattern; // Remember position of pattern before '*', already decremented. |
1516 | 0 | nStr = nString; // Remember the current string to be matched. |
1517 | 0 | } |
1518 | | |
1519 | 0 | if (nString <= nEndPos) |
1520 | | // Whatever leads in pattern, string will not match. |
1521 | 0 | return aRes; |
1522 | | |
1523 | | // nString will be decremented by iterateCodePoints(). |
1524 | 0 | sal_uInt32 cString = searchStr.iterateCodePoints( &nString, -1); |
1525 | |
|
1526 | 0 | if ((cPattern != '?' || bEscaped) && cPattern != cString) |
1527 | 0 | { |
1528 | 0 | if (nPat == -1) |
1529 | | // Non-match already without any '*' pattern. |
1530 | 0 | return aRes; |
1531 | | |
1532 | 0 | bRewind = true; |
1533 | 0 | nPattern = nPat; // Rewind pattern to character before '*', already decremented. |
1534 | 0 | cPattern = cPatternAfterAsterisk; |
1535 | 0 | bEscaped = bEscapedAfterAsterisk; |
1536 | 0 | searchStr.iterateCodePoints( &nStr, -1); |
1537 | 0 | nString = nStr; // Restore decremented remembered string position. |
1538 | 0 | if (nPat == nAfterFakePattern) |
1539 | 0 | { |
1540 | | // Next start offset will be this character (exclusive). |
1541 | 0 | nStartOffset = nString; |
1542 | 0 | } |
1543 | 0 | } |
1544 | 0 | else |
1545 | 0 | { |
1546 | | // An unescaped '?' pattern matched any character, or characters |
1547 | | // matched. Reset only escaped state. |
1548 | 0 | bEscaped = false; |
1549 | 0 | } |
1550 | 0 | } |
1551 | 0 | while (nString > nEndPos); |
1552 | | |
1553 | 0 | if (bRewind) |
1554 | 0 | return aRes; |
1555 | | |
1556 | | // Eat leading '*' pattern that matches anything, including nothing. |
1557 | | // '*' is one code unit, so not using iterateCodePoints() is ok. |
1558 | 0 | while (nPattern > 0 && rReversePattern[nPattern-1] == '*') |
1559 | 0 | --nPattern; |
1560 | |
|
1561 | 0 | if (nPattern == 0) |
1562 | 0 | setWildcardMatch( aRes, nStartOffset, nEndOffset); |
1563 | 0 | return aRes; |
1564 | 0 | } |
1565 | | |
1566 | | |
1567 | | OUString SAL_CALL |
1568 | | TextSearch::getImplementationName() |
1569 | 0 | { |
1570 | 0 | return u"com.sun.star.util.TextSearch_i18n"_ustr; |
1571 | 0 | } |
1572 | | |
1573 | | sal_Bool SAL_CALL TextSearch::supportsService(const OUString& rServiceName) |
1574 | 0 | { |
1575 | 0 | return cppu::supportsService(this, rServiceName); |
1576 | 0 | } |
1577 | | |
1578 | | Sequence< OUString > SAL_CALL |
1579 | | TextSearch::getSupportedServiceNames() |
1580 | 0 | { |
1581 | 0 | return { u"com.sun.star.util.TextSearch"_ustr, u"com.sun.star.util.TextSearch2"_ustr }; |
1582 | 0 | } |
1583 | | |
1584 | | extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface* |
1585 | | i18npool_TextSearch_get_implementation( |
1586 | | css::uno::XComponentContext* context , css::uno::Sequence<css::uno::Any> const&) |
1587 | 3 | { |
1588 | 3 | return cppu::acquire(new TextSearch(context)); |
1589 | 3 | } |
1590 | | |
1591 | | /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |