/src/libreoffice/unotools/source/i18n/textsearch.cxx
Line | Count | Source |
1 | | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
2 | | /* |
3 | | * This file is part of the LibreOffice project. |
4 | | * |
5 | | * This Source Code Form is subject to the terms of the Mozilla Public |
6 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
7 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8 | | * |
9 | | * This file incorporates work covered by the following license notice: |
10 | | * |
11 | | * Licensed to the Apache Software Foundation (ASF) under one or more |
12 | | * contributor license agreements. See the NOTICE file distributed |
13 | | * with this work for additional information regarding copyright |
14 | | * ownership. The ASF licenses this file to you under the Apache |
15 | | * License, Version 2.0 (the "License"); you may not use this file |
16 | | * except in compliance with the License. You may obtain a copy of |
17 | | * the License at http://www.apache.org/licenses/LICENSE-2.0 . |
18 | | */ |
19 | | |
20 | | #include <sal/config.h> |
21 | | |
22 | | #include <cstdlib> |
23 | | #include <string_view> |
24 | | |
25 | | #include <i18nlangtag/languagetag.hxx> |
26 | | #include <i18nutil/searchopt.hxx> |
27 | | #include <i18nutil/transliteration.hxx> |
28 | | #include <com/sun/star/util/TextSearch2.hpp> |
29 | | #include <com/sun/star/util/SearchAlgorithms2.hpp> |
30 | | #include <com/sun/star/util/SearchFlags.hpp> |
31 | | #include <unotools/charclass.hxx> |
32 | | #include <comphelper/processfactory.hxx> |
33 | | #include <unotools/textsearch.hxx> |
34 | | #include <rtl/ustrbuf.hxx> |
35 | | #include <comphelper/diagnose_ex.hxx> |
36 | | #include <mutex> |
37 | | |
38 | | using namespace ::com::sun::star::util; |
39 | | using namespace ::com::sun::star::uno; |
40 | | using namespace ::com::sun::star::lang; |
41 | | |
42 | | namespace utl |
43 | | { |
44 | | |
45 | | SearchParam::SearchParam( const OUString &rText, |
46 | | SearchType eType, |
47 | | bool bCaseSensitive, |
48 | | sal_uInt32 cWildEscChar, |
49 | | bool bWildMatchSel ) |
50 | 48 | { |
51 | 48 | sSrchStr = rText; |
52 | 48 | m_eSrchType = eType; |
53 | | |
54 | 48 | m_cWildEscChar = cWildEscChar; |
55 | | |
56 | 48 | m_bCaseSense = bCaseSensitive; |
57 | 48 | m_bWildMatchSel = bWildMatchSel; |
58 | 48 | } |
59 | | |
60 | | SearchParam::SearchParam( const SearchParam& rParam ) |
61 | 0 | { |
62 | 0 | sSrchStr = rParam.sSrchStr; |
63 | 0 | m_eSrchType = rParam.m_eSrchType; |
64 | |
|
65 | 0 | m_cWildEscChar = rParam.m_cWildEscChar; |
66 | |
|
67 | 0 | m_bCaseSense = rParam.m_bCaseSense; |
68 | 0 | m_bWildMatchSel = rParam.m_bWildMatchSel; |
69 | 0 | } |
70 | | |
71 | 48 | SearchParam::~SearchParam() {} |
72 | | |
73 | | static bool lcl_Equals( const i18nutil::SearchOptions2& rSO1, const i18nutil::SearchOptions2& rSO2 ) |
74 | 48 | { |
75 | 48 | return |
76 | 48 | rSO1.AlgorithmType2 == rSO2.AlgorithmType2 && |
77 | 46 | rSO1.WildcardEscapeCharacter == rSO2.WildcardEscapeCharacter && |
78 | 46 | rSO1.searchFlag == rSO2.searchFlag && |
79 | 46 | rSO1.searchString == rSO2.searchString && |
80 | 45 | rSO1.replaceString == rSO2.replaceString && |
81 | 45 | rSO1.changedChars == rSO2.changedChars && |
82 | 45 | rSO1.deletedChars == rSO2.deletedChars && |
83 | 45 | rSO1.insertedChars == rSO2.insertedChars && |
84 | 45 | rSO1.Locale.Language == rSO2.Locale.Language && |
85 | 45 | rSO1.Locale.Country == rSO2.Locale.Country && |
86 | 45 | rSO1.Locale.Variant == rSO2.Locale.Variant && |
87 | 45 | rSO1.transliterateFlags == rSO2.transliterateFlags; |
88 | 48 | } |
89 | | |
90 | | namespace |
91 | | { |
92 | | struct CachedTextSearch |
93 | | { |
94 | | std::mutex mutex; |
95 | | i18nutil::SearchOptions2 Options; |
96 | | css::uno::Reference< css::util::XTextSearch2 > xTextSearch; |
97 | | }; |
98 | | } |
99 | | |
100 | | Reference<XTextSearch2> TextSearch::getXTextSearch( const i18nutil::SearchOptions2& rPara ) |
101 | 48 | { |
102 | 48 | static CachedTextSearch theCachedTextSearch; |
103 | | |
104 | 48 | std::scoped_lock aGuard(theCachedTextSearch.mutex); |
105 | | |
106 | 48 | if ( lcl_Equals(theCachedTextSearch.Options, rPara) ) |
107 | 45 | return theCachedTextSearch.xTextSearch; |
108 | | |
109 | 3 | const Reference< XComponentContext >& xContext = ::comphelper::getProcessComponentContext(); |
110 | 3 | theCachedTextSearch.xTextSearch.set( ::TextSearch2::create(xContext) ); |
111 | 3 | theCachedTextSearch.xTextSearch->setOptions2( rPara.toUnoSearchOptions2() ); |
112 | 3 | theCachedTextSearch.Options = rPara; |
113 | | |
114 | 3 | return theCachedTextSearch.xTextSearch; |
115 | 48 | } |
116 | | |
117 | | TextSearch::TextSearch(const SearchParam & rParam, LanguageType eLang ) |
118 | 0 | { |
119 | 0 | if( LANGUAGE_NONE == eLang ) |
120 | 0 | eLang = LANGUAGE_SYSTEM; |
121 | 0 | css::lang::Locale aLocale( LanguageTag::convertToLocale( eLang ) ); |
122 | |
|
123 | 0 | Init( rParam, aLocale); |
124 | 0 | } |
125 | | |
126 | | TextSearch::TextSearch(const SearchParam & rParam, const CharClass& rCClass ) |
127 | 48 | { |
128 | 48 | Init( rParam, rCClass.getLanguageTag().getLocale() ); |
129 | 48 | } |
130 | | |
131 | | TextSearch::TextSearch( const i18nutil::SearchOptions2& rPara ) |
132 | 0 | { |
133 | 0 | xTextSearch = getXTextSearch( rPara ); |
134 | 0 | } |
135 | | |
136 | | void TextSearch::Init( const SearchParam & rParam, |
137 | | const css::lang::Locale& rLocale ) |
138 | 48 | { |
139 | | // convert SearchParam to the UNO SearchOptions2 |
140 | 48 | i18nutil::SearchOptions2 aSOpt; |
141 | | |
142 | 48 | switch( rParam.GetSrchType() ) |
143 | 48 | { |
144 | 16 | case SearchParam::SearchType::Wildcard: |
145 | 16 | aSOpt.AlgorithmType2 = SearchAlgorithms2::WILDCARD; |
146 | 16 | aSOpt.WildcardEscapeCharacter = rParam.GetWildEscChar(); |
147 | 16 | if (rParam.IsWildMatchSel()) |
148 | 16 | aSOpt.searchFlag |= SearchFlags::WILD_MATCH_SELECTION; |
149 | 16 | break; |
150 | | |
151 | 0 | case SearchParam::SearchType::Regexp: |
152 | 0 | aSOpt.AlgorithmType2 = SearchAlgorithms2::REGEXP; |
153 | 0 | break; |
154 | | |
155 | 32 | case SearchParam::SearchType::Normal: |
156 | 32 | aSOpt.AlgorithmType2 = SearchAlgorithms2::ABSOLUTE; |
157 | 32 | break; |
158 | | |
159 | 0 | default: |
160 | 0 | for (;;) std::abort(); |
161 | 48 | } |
162 | 48 | aSOpt.searchString = rParam.GetSrchStr(); |
163 | 48 | aSOpt.replaceString = ""; |
164 | 48 | aSOpt.Locale = rLocale; |
165 | 48 | aSOpt.transliterateFlags = TransliterationFlags::NONE; |
166 | 48 | if( !rParam.IsCaseSensitive() ) |
167 | 48 | { |
168 | 48 | aSOpt.searchFlag |= SearchFlags::ALL_IGNORE_CASE; |
169 | 48 | aSOpt.transliterateFlags |= TransliterationFlags::IGNORE_CASE; |
170 | 48 | } |
171 | | |
172 | 48 | xTextSearch = getXTextSearch( aSOpt ); |
173 | 48 | } |
174 | | |
175 | | void TextSearch::SetLocale( const i18nutil::SearchOptions2& rOptions, |
176 | | const css::lang::Locale& rLocale ) |
177 | 0 | { |
178 | 0 | i18nutil::SearchOptions2 aSOpt( rOptions ); |
179 | 0 | aSOpt.Locale = rLocale; |
180 | |
|
181 | 0 | xTextSearch = getXTextSearch( aSOpt ); |
182 | 0 | } |
183 | | |
184 | | TextSearch::~TextSearch() |
185 | 48 | { |
186 | 48 | } |
187 | | |
188 | | /* |
189 | | * General search methods. These methods will call the respective |
190 | | * methods, such as ordinary string searching or regular expression |
191 | | * matching, using the method pointer. |
192 | | */ |
193 | | bool TextSearch::SearchForward( const OUString &rStr, |
194 | | sal_Int32* pStart, sal_Int32* pEnd, |
195 | | css::util::SearchResult* pRes) |
196 | 1.19k | { |
197 | 1.19k | bool bRet = false; |
198 | 1.19k | try |
199 | 1.19k | { |
200 | 1.19k | if( xTextSearch.is() ) |
201 | 1.19k | { |
202 | 1.19k | SearchResult aRet( xTextSearch->searchForward( rStr, *pStart, *pEnd )); |
203 | 1.19k | if( aRet.subRegExpressions > 0 ) |
204 | 210 | { |
205 | 210 | bRet = true; |
206 | | // the XTextsearch returns in startOffset the higher position |
207 | | // and the endposition is always exclusive. |
208 | | // The caller of this function will have in startPos the |
209 | | // lower pos. and end |
210 | 210 | *pStart = aRet.startOffset[ 0 ]; |
211 | 210 | *pEnd = aRet.endOffset[ 0 ]; |
212 | 210 | if( pRes ) |
213 | 0 | *pRes = std::move(aRet); |
214 | 210 | } |
215 | 1.19k | } |
216 | 1.19k | } |
217 | 1.19k | catch ( Exception& ) |
218 | 1.19k | { |
219 | 0 | TOOLS_WARN_EXCEPTION("unotools.i18n", "" ); |
220 | 0 | } |
221 | 1.19k | return bRet; |
222 | 1.19k | } |
223 | | |
224 | | bool TextSearch::searchForward( const OUString &rStr ) |
225 | 0 | { |
226 | 0 | sal_Int32 pStart = 0; |
227 | 0 | sal_Int32 pEnd = rStr.getLength(); |
228 | |
|
229 | 0 | bool bResult = SearchForward(rStr, &pStart, &pEnd); |
230 | |
|
231 | 0 | return bResult; |
232 | 0 | } |
233 | | |
234 | | bool TextSearch::SearchBackward( const OUString & rStr, sal_Int32* pStart, |
235 | | sal_Int32* pEnd, SearchResult* pRes ) |
236 | 0 | { |
237 | 0 | bool bRet = false; |
238 | 0 | try |
239 | 0 | { |
240 | 0 | if( xTextSearch.is() ) |
241 | 0 | { |
242 | 0 | SearchResult aRet( xTextSearch->searchBackward( rStr, *pStart, *pEnd )); |
243 | 0 | if( aRet.subRegExpressions ) |
244 | 0 | { |
245 | 0 | bRet = true; |
246 | | // the XTextsearch returns in startOffset the higher position |
247 | | // and the endposition is always exclusive. |
248 | | // The caller of this function will have in startPos the |
249 | | // lower pos. and end |
250 | 0 | *pEnd = aRet.startOffset[ 0 ]; |
251 | 0 | *pStart = aRet.endOffset[ 0 ]; |
252 | 0 | if( pRes ) |
253 | 0 | *pRes = std::move(aRet); |
254 | 0 | } |
255 | 0 | } |
256 | 0 | } |
257 | 0 | catch ( Exception& ) |
258 | 0 | { |
259 | 0 | TOOLS_WARN_EXCEPTION("unotools.i18n", "" ); |
260 | 0 | } |
261 | 0 | return bRet; |
262 | 0 | } |
263 | | |
264 | | // static |
265 | | void TextSearch::ReplaceBackReferences( OUString& rReplaceStr, std::u16string_view rStr, const SearchResult& rResult ) |
266 | 0 | { |
267 | 0 | if( rResult.subRegExpressions <= 0 ) |
268 | 0 | return; |
269 | | |
270 | 0 | sal_Unicode sFndChar; |
271 | 0 | sal_Int32 i; |
272 | 0 | OUStringBuffer sBuff(rReplaceStr.getLength()*4); |
273 | 0 | for(i = 0; i < rReplaceStr.getLength(); i++) |
274 | 0 | { |
275 | 0 | if( rReplaceStr[i] == '&') |
276 | 0 | { |
277 | 0 | sal_Int32 nStart = rResult.startOffset[0]; |
278 | 0 | sal_Int32 nLength = rResult.endOffset[0] - rResult.startOffset[0]; |
279 | 0 | sBuff.append(rStr.substr(nStart, nLength)); |
280 | 0 | } |
281 | 0 | else if((i < rReplaceStr.getLength() - 1) && rReplaceStr[i] == '$') |
282 | 0 | { |
283 | 0 | sFndChar = rReplaceStr[ i + 1 ]; |
284 | 0 | switch(sFndChar) |
285 | 0 | { // placeholder for a backward reference? |
286 | 0 | case '0': |
287 | 0 | case '1': |
288 | 0 | case '2': |
289 | 0 | case '3': |
290 | 0 | case '4': |
291 | 0 | case '5': |
292 | 0 | case '6': |
293 | 0 | case '7': |
294 | 0 | case '8': |
295 | 0 | case '9': |
296 | 0 | { |
297 | 0 | int j = sFndChar - '0'; // index |
298 | 0 | if(j < rResult.subRegExpressions) |
299 | 0 | { |
300 | 0 | sal_Int32 nSttReg = rResult.startOffset[j]; |
301 | 0 | sal_Int32 nRegLen = rResult.endOffset[j]; |
302 | 0 | if (nSttReg < 0 || nRegLen < 0) // A "not found" optional capture |
303 | 0 | { |
304 | 0 | nSttReg = nRegLen = 0; // Copy empty string |
305 | 0 | } |
306 | 0 | else if (nRegLen >= nSttReg) |
307 | 0 | { |
308 | 0 | nRegLen = nRegLen - nSttReg; |
309 | 0 | } |
310 | 0 | else |
311 | 0 | { |
312 | 0 | nRegLen = nSttReg - nRegLen; |
313 | 0 | nSttReg = rResult.endOffset[j]; |
314 | 0 | } |
315 | | // Copy reference from found string |
316 | 0 | sBuff.append(rStr.substr(nSttReg, nRegLen)); |
317 | 0 | } |
318 | 0 | i += 1; |
319 | 0 | } |
320 | 0 | break; |
321 | 0 | default: |
322 | 0 | sBuff.append(OUStringChar(rReplaceStr[i]) + OUStringChar(rReplaceStr[i+1])); |
323 | 0 | i += 1; |
324 | 0 | break; |
325 | 0 | } |
326 | 0 | } |
327 | 0 | else if((i < rReplaceStr.getLength() - 1) && rReplaceStr[i] == '\\') |
328 | 0 | { |
329 | 0 | sFndChar = rReplaceStr[ i+1 ]; |
330 | 0 | switch(sFndChar) |
331 | 0 | { |
332 | 0 | case '\\': |
333 | 0 | case '&': |
334 | 0 | case '$': |
335 | 0 | sBuff.append(sFndChar); |
336 | 0 | i+=1; |
337 | 0 | break; |
338 | 0 | case 't': |
339 | 0 | sBuff.append('\t'); |
340 | 0 | i += 1; |
341 | 0 | break; |
342 | 0 | default: |
343 | 0 | sBuff.append(OUStringChar(rReplaceStr[i]) + OUStringChar(rReplaceStr[i+1])); |
344 | 0 | i += 1; |
345 | 0 | break; |
346 | 0 | } |
347 | 0 | } |
348 | 0 | else |
349 | 0 | { |
350 | 0 | sBuff.append(rReplaceStr[i]); |
351 | 0 | } |
352 | 0 | } |
353 | 0 | rReplaceStr = sBuff.makeStringAndClear(); |
354 | 0 | } |
355 | | |
356 | | bool TextSearch::SimilaritySearch(const OUString& rString, const OUString& rSearchString, |
357 | | ::std::pair<sal_Int32, sal_Int32>& rSimilarityScore) |
358 | 0 | { |
359 | 0 | sal_Int32 nScore = 0; |
360 | 0 | sal_Int32 nFirstScore = GetSubstringSimilarity(rString, rSearchString, nScore, true); |
361 | 0 | if (nFirstScore == -1) |
362 | 0 | nFirstScore = GetSubstringSimilarity(rString, rSearchString, nScore, false); |
363 | 0 | if (nFirstScore == -1) |
364 | 0 | { |
365 | 0 | if (rSearchString.getLength() == 1) |
366 | 0 | { |
367 | 0 | if (rString.startsWith(rSearchString)) |
368 | 0 | nFirstScore = nScore; |
369 | 0 | else if (rString.endsWith(rSearchString)) |
370 | 0 | nFirstScore = nScore + 1; |
371 | 0 | nScore += 2; |
372 | 0 | } |
373 | 0 | else if (rString.getLength() == 1 && rSearchString.getLength() < SMALL_STRING_THRESHOLD) |
374 | 0 | { |
375 | 0 | if (rSearchString.startsWith(rString)) |
376 | 0 | nFirstScore = nScore; |
377 | 0 | else if (rSearchString.endsWith(rString)) |
378 | 0 | nFirstScore = nScore + 1; |
379 | 0 | nScore += 2; |
380 | 0 | } |
381 | 0 | } |
382 | 0 | sal_Int32 nSecondScore = GetWeightedLevenshteinDistance(rString, rSearchString); |
383 | |
|
384 | 0 | if (nFirstScore == -1 && nSecondScore >= WLD_THRESHOLD) |
385 | 0 | return false; |
386 | | |
387 | 0 | rSimilarityScore.first = (nFirstScore == -1) ? nScore : nFirstScore; |
388 | 0 | rSimilarityScore.second = nSecondScore; |
389 | 0 | return true; |
390 | 0 | } |
391 | | |
392 | | sal_Int32 TextSearch::GetSubstringSimilarity(std::u16string_view rString, |
393 | | std::u16string_view rSearchString, |
394 | | sal_Int32& nInitialScore, const bool bFromStart) |
395 | 0 | { |
396 | 0 | sal_Int32 nScore = -1; |
397 | 0 | for (sal_Int32 length = rSearchString.length(); length > 1; length--) |
398 | 0 | { |
399 | 0 | sal_Int32 nStartPos = bFromStart ? 0 : rSearchString.length() - length; |
400 | 0 | std::u16string_view rSearchSubString = rSearchString.substr(nStartPos, length); |
401 | 0 | if (rString.starts_with(rSearchSubString)) |
402 | 0 | { |
403 | 0 | nScore = nInitialScore; |
404 | 0 | break; |
405 | 0 | } |
406 | 0 | else if (rString.ends_with(rSearchSubString)) |
407 | 0 | { |
408 | 0 | nScore = nInitialScore + 1; |
409 | 0 | break; |
410 | 0 | } |
411 | 0 | else if (rString.find(rSearchSubString) != std::u16string_view::npos) |
412 | 0 | { |
413 | 0 | nScore = nInitialScore + 2; |
414 | 0 | break; |
415 | 0 | } |
416 | 0 | nInitialScore += 3; |
417 | 0 | } |
418 | 0 | return nScore; |
419 | 0 | } |
420 | | |
421 | | sal_Int32 TextSearch::GetWeightedLevenshteinDistance(const OUString& rString, |
422 | | const OUString& rSearchString) |
423 | 0 | { |
424 | 0 | sal_Int32 n = rString.getLength(); |
425 | 0 | sal_Int32 m = rSearchString.getLength(); |
426 | 0 | std::vector<std::vector<sal_Int32>> ScoreDP(n + 1, std::vector<sal_Int32>(m + 1)); |
427 | |
|
428 | 0 | for (sal_Int32 i = 0; i <= n; i++) |
429 | 0 | { |
430 | 0 | ScoreDP[i][0] = i; |
431 | 0 | } |
432 | 0 | for (sal_Int32 j = 0; j <= m; j++) |
433 | 0 | { |
434 | 0 | ScoreDP[0][j] = j; |
435 | 0 | } |
436 | |
|
437 | 0 | for (sal_Int32 i = 1; i <= n; i++) |
438 | 0 | { |
439 | 0 | for (sal_Int32 j = 1; j <= m; j++) |
440 | 0 | { |
441 | 0 | sal_Int32& minE = ScoreDP[i][j]; |
442 | 0 | minE = ScoreDP[i - 1][j] + 1; |
443 | 0 | minE = std::min(minE, ScoreDP[i][j - 1] + 1); |
444 | 0 | if (rString[i - 1] != rSearchString[j - 1]) |
445 | 0 | minE = std::min(minE, ScoreDP[i - 1][j - 1] + 2); |
446 | 0 | else |
447 | 0 | minE = std::min(minE, ScoreDP[i - 1][j - 1]); |
448 | 0 | } |
449 | 0 | } |
450 | 0 | return ScoreDP[n][m]; |
451 | 0 | } |
452 | | |
453 | | } // namespace utl |
454 | | |
455 | | /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |