/src/libreoffice/i18npool/source/search/levdis.cxx
Line | Count | Source |
1 | | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
2 | | /* |
3 | | * This file is part of the LibreOffice project. |
4 | | * |
5 | | * This Source Code Form is subject to the terms of the Mozilla Public |
6 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
7 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8 | | * |
9 | | * This file incorporates work covered by the following license notice: |
10 | | * |
11 | | * Licensed to the Apache Software Foundation (ASF) under one or more |
12 | | * contributor license agreements. See the NOTICE file distributed |
13 | | * with this work for additional information regarding copyright |
14 | | * ownership. The ASF licenses this file to you under the Apache |
15 | | * License, Version 2.0 (the "License"); you may not use this file |
16 | | * except in compliance with the License. You may obtain a copy of |
17 | | * the License at http://www.apache.org/licenses/LICENSE-2.0 . |
18 | | */ |
19 | | |
20 | | /* |
21 | | |
22 | | Weighted Levenshtein Distance |
23 | | including wildcards |
24 | | '*' for any number (0 or more) of arbitrary characters |
25 | | '?' for exactly one arbitrary character |
26 | | escapable with backslash, "\*" or "\?" |
27 | | |
28 | | Return: |
29 | | WLD if WLD <= nLimit, else nLimit+1 |
30 | | |
31 | | or, if bSplitCount: |
32 | | WLD if WLD <= nLimit |
33 | | -WLD if Replace and Insert and Delete <= nLimit |
34 | | else nLimit+1 |
35 | | |
36 | | Recursive definition of WLD: |
37 | | |
38 | | WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) , |
39 | | WLD( X(i) , Y(j-1) ) + q , |
40 | | WLD( X(i-1), Y(j) ) + r ) |
41 | | |
42 | | X(i) := the first i characters of the word X |
43 | | Y(j) := the first j characters of the word Y |
44 | | p(i,j) := 0 if i-th character of X == j-th character of Y, |
45 | | p else |
46 | | |
47 | | Boundary conditions: |
48 | | WLD( X(0), Y(j) ) := j*q (Y created by j inserts) |
49 | | WLD( X(i), Y(0) ) := i*r (Y created by i deletes) |
50 | | WLD( X(0), Y(0) ) := 0 |
51 | | |
52 | | Instead of recursions a dynamic algorithm is used. |
53 | | |
54 | | See also: German computer magazine |
55 | | c't 07/89 pages 192-208 and c't 03/94 pages 230-239 |
56 | | */ |
57 | | |
58 | | #include <algorithm> |
59 | | #include <numeric> |
60 | | |
61 | | #include "levdis.hxx" |
62 | | |
63 | 0 | #define LEVDISBIG (nLimit + 1) // Return value if distance > nLimit |
64 | 0 | #define LEVDISDOUBLEBUF 2048 // no doubling atop this border |
65 | | |
66 | | static sal_Int32 Impl_WLD_StringLen( const sal_Unicode* pStr ) |
67 | 0 | { |
68 | 0 | const sal_Unicode* pTempStr = pStr; |
69 | 0 | while( *pTempStr ) |
70 | 0 | pTempStr++; |
71 | 0 | return static_cast<sal_Int32>(pTempStr-pStr); |
72 | 0 | } |
73 | | |
74 | | // Distance from string to pattern |
75 | | int WLevDistance::WLD( const sal_Unicode* cString, sal_Int32 nStringLen ) |
76 | 0 | { |
77 | 0 | int nSPMin = 0; // penalty point Minimum |
78 | 0 | int nRepS = 0; // for SplitCount |
79 | | |
80 | | // length difference between pattern and string |
81 | 0 | int nLenDiff = nPatternLen - nStars - nStringLen; |
82 | | // more insertions or deletions necessary as the limit? Then leave |
83 | 0 | if ( (nLenDiff * nInsQ0 > nLimit) |
84 | 0 | || ((nStars == 0) && (nLenDiff * nDelR0 < -nLimit)) ) |
85 | 0 | return LEVDISBIG; |
86 | | |
87 | | // comparative String greater than instantaneous array |
88 | | // -> adapt array size |
89 | 0 | if ( nStringLen >= nArrayLen ) |
90 | 0 | { |
91 | | // increase size much more to avoid reallocation |
92 | 0 | if ( nStringLen < LEVDISDOUBLEBUF ) |
93 | 0 | nArrayLen = 2 * nStringLen; |
94 | 0 | else |
95 | 0 | nArrayLen = nStringLen + 1; |
96 | 0 | npDistance = aDisMem.NewMem( nArrayLen ); |
97 | 0 | } |
98 | | |
99 | | // Calculate start values of the second column (first pattern value). |
100 | | // First column (0-Len pattern) is always zero .. nStringLen * nInsQ0, |
101 | | // therefore the minimum is 0 |
102 | 0 | if ( nPatternLen == 0 ) |
103 | 0 | { |
104 | | // Count of deletions to reach pattern |
105 | 0 | for ( sal_Int32 i=0; i <= nStringLen; i++ ) |
106 | 0 | npDistance[i] = i * nDelR0; |
107 | 0 | } |
108 | 0 | else if ( cpPattern[0] == '*' && bpPatIsWild[0] ) |
109 | 0 | { |
110 | | // instead of a '*' you can fit in anything |
111 | 0 | for ( sal_Int32 i=0; i <= nStringLen; i++ ) |
112 | 0 | npDistance[i] = 0; |
113 | 0 | } |
114 | 0 | else |
115 | 0 | { |
116 | 0 | sal_Unicode c; |
117 | 0 | int nP; |
118 | 0 | c = cpPattern[0]; |
119 | 0 | if ( c == '?' && bpPatIsWild[0] ) |
120 | 0 | nP = 0; // a '?' could be any character. |
121 | 0 | else |
122 | | // Minimum of replacement and deletion+insertion weighting |
123 | 0 | nP = std::min({ nRepP0, nRepP0, nDelR0 + nInsQ0 }); |
124 | 0 | npDistance[0] = nInsQ0; // start with simple insert |
125 | 0 | npDistance[1] = nInsQ0; |
126 | 0 | npDistance[2] = nInsQ0; |
127 | 0 | int nReplacePos = -1; // tristate flag |
128 | 0 | int nDelCnt = 0; |
129 | 0 | for ( sal_Int32 i=1; i <= nStringLen; i++, nDelCnt += nDelR0 ) |
130 | 0 | { |
131 | 0 | if ( cString[i-1] == c ) |
132 | 0 | nP = 0; // Replace from this position is 0 |
133 | | // Deletions to match pattern + Replace |
134 | 0 | npDistance[i] = nDelCnt + nP; |
135 | 0 | if ( bSplitCount ) |
136 | 0 | { |
137 | 0 | if ( nReplacePos < 0 && nP ) |
138 | 0 | { // this position will be replaced |
139 | 0 | nRepS++; |
140 | 0 | nReplacePos = i; |
141 | 0 | } |
142 | 0 | else if ( nReplacePos > 0 && !nP ) |
143 | 0 | { |
144 | | // same count of c |
145 | 0 | int nBalance = levdisbalance( 0, i-1, c, cString, nStringLen ); |
146 | 0 | if ( !nBalance ) |
147 | 0 | { // one was replaced that was an insertion instead |
148 | 0 | nRepS--; |
149 | 0 | nReplacePos = 0; |
150 | 0 | } |
151 | 0 | } |
152 | 0 | } |
153 | 0 | } |
154 | 0 | nSPMin = std::min({ npDistance[0], npDistance[1], npDistance[2] }); |
155 | 0 | } |
156 | | |
157 | | // calculate distance matrix |
158 | 0 | sal_Int32 j = 0; // for all columns of the pattern, till limit is not reached |
159 | 0 | while ( (j < nPatternLen-1) |
160 | 0 | && nSPMin <= (bSplitCount ? 2 * nLimit : nLimit) ) |
161 | 0 | { |
162 | 0 | sal_Unicode c; |
163 | 0 | int nP, nQ, nR, nPij, d2; |
164 | |
|
165 | 0 | j++; |
166 | 0 | c = cpPattern[j]; |
167 | 0 | if ( bpPatIsWild[j] ) // '*' or '?' not escaped |
168 | 0 | nP = 0; // could be replaced without penalty |
169 | 0 | else |
170 | 0 | nP = nRepP0; |
171 | 0 | if ( c == '*' && bpPatIsWild[j] ) |
172 | 0 | { |
173 | 0 | nQ = 0; // insertion and deletion without penalty |
174 | 0 | nR = 0; |
175 | 0 | } |
176 | 0 | else |
177 | 0 | { |
178 | 0 | nQ = nInsQ0; // usual weighting |
179 | 0 | nR = nDelR0; |
180 | 0 | } |
181 | 0 | d2 = npDistance[0]; |
182 | | // increase insert count to get from null string to pattern |
183 | 0 | npDistance[0] = npDistance[0] + nQ; |
184 | 0 | nSPMin = npDistance[0]; |
185 | 0 | int nReplacePos = -1; // tristate flag |
186 | | // for each pattern column run through the string |
187 | 0 | for ( sal_Int32 i=1; i <= nStringLen; i++ ) |
188 | 0 | { |
189 | 0 | int d1 = d2; // WLD( X(i-1), Y(j-1) ) |
190 | 0 | d2 = npDistance[i]; // WLD( X(i) , Y(j-1) ) |
191 | 0 | if ( cString[i-1] == c ) |
192 | 0 | { |
193 | 0 | nPij = 0; // p(i,j) |
194 | 0 | if ( nReplacePos < 0 ) |
195 | 0 | { |
196 | | // same count of c |
197 | 0 | int nBalance = levdisbalance( j, i-1, c, cString, nStringLen ); |
198 | 0 | if ( !nBalance ) |
199 | 0 | nReplacePos = 0; // no replacement |
200 | 0 | } |
201 | 0 | } |
202 | 0 | else |
203 | 0 | nPij = nP; |
204 | | // WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) , |
205 | | // WLD( X(i) , Y(j-1) ) + q , |
206 | | // WLD( X(i-1), Y(j) ) + r ) |
207 | 0 | npDistance[i] = std::min({ d1 + nPij, d2 + nQ, npDistance[i-1] + nR }); |
208 | 0 | if ( npDistance[i] < nSPMin ) |
209 | 0 | nSPMin = npDistance[i]; |
210 | 0 | if ( bSplitCount ) |
211 | 0 | { |
212 | 0 | if ( nReplacePos < 0 && nPij && npDistance[i] == d1 + nPij ) |
213 | 0 | { // this position will be replaced |
214 | 0 | nRepS++; |
215 | 0 | nReplacePos = i; |
216 | 0 | } |
217 | 0 | else if ( nReplacePos > 0 && !nPij ) |
218 | 0 | { |
219 | | // character is equal in string and pattern |
220 | | // |
221 | | // If from this point: |
222 | | // * pattern and string have the same count of this |
223 | | // character |
224 | | // * and character count is the same before this position |
225 | | // then the replace was none. |
226 | | // |
227 | | // Scrambled letters are recognized here and the nRepS |
228 | | // replacement is withdrawn, whereby the double limit kicks |
229 | | // in. |
230 | | |
231 | | // Same count of c |
232 | 0 | int nBalance = levdisbalance( j, i-1, c, cString, nStringLen ); |
233 | 0 | if ( !nBalance ) |
234 | 0 | { // one was replaced that was an insertion instead |
235 | 0 | nRepS--; |
236 | 0 | nReplacePos = 0; |
237 | 0 | } |
238 | 0 | } |
239 | 0 | } |
240 | 0 | } |
241 | 0 | } |
242 | 0 | if ( (nSPMin <= nLimit) && (npDistance[nStringLen] <= nLimit) ) |
243 | 0 | return npDistance[nStringLen]; |
244 | 0 | else |
245 | 0 | { |
246 | 0 | if ( bSplitCount ) |
247 | 0 | { |
248 | 0 | if ( nRepS && nLenDiff > 0 ) |
249 | 0 | nRepS -= nLenDiff; // Inserts were counted |
250 | 0 | if ( (nSPMin <= 2 * nLimit) |
251 | 0 | && (npDistance[nStringLen] <= 2 * nLimit) |
252 | 0 | && (nRepS * nRepP0 <= nLimit) ) |
253 | 0 | return -npDistance[nStringLen]; |
254 | 0 | return LEVDISBIG; |
255 | 0 | } |
256 | 0 | return LEVDISBIG; |
257 | 0 | } |
258 | 0 | } |
259 | | |
260 | | // Calculating nLimit, nReplP0, nInsQ0, nDelR0, bSplitCount |
261 | | // from user values nOtherX, nShorterY, nLongerZ, bRelaxed |
262 | | void WLevDistance::CalcLPQR( int nX, int nY, int nZ, bool bRelaxed ) |
263 | 0 | { |
264 | 0 | if ( nX < 0 ) nX = 0; // only positive values |
265 | 0 | if ( nY < 0 ) nY = 0; |
266 | 0 | if ( nZ < 0 ) nZ = 0; |
267 | 0 | if (0 == std::min({ nX, nY, nZ })) // at least one 0 |
268 | 0 | { |
269 | 0 | int nMid, nMax; |
270 | 0 | nMax = std::max({ nX, nY, nZ }); // either 0 for three 0s or Max |
271 | 0 | if ( 0 == (nMid = Mid3( nX, nY, nZ )) ) // even two 0 |
272 | 0 | nLimit = nMax; // either 0 or the only one >0 |
273 | 0 | else // one is 0 |
274 | 0 | nLimit = std::lcm( nMid, nMax ); |
275 | 0 | } |
276 | 0 | else // all three of them are not 0 |
277 | 0 | nLimit = std::lcm(std::lcm(nX, nY), nZ); |
278 | 0 | nRepP0 = ( nX ? nLimit / nX : nLimit + 1 ); |
279 | 0 | nInsQ0 = ( nY ? nLimit / nY : nLimit + 1 ); |
280 | 0 | nDelR0 = ( nZ ? nLimit / nZ : nLimit + 1 ); |
281 | 0 | bSplitCount = bRelaxed; |
282 | 0 | } |
283 | | |
284 | | // The value in the middle |
285 | | int WLevDistance::Mid3( int x, int y, int z ) |
286 | 0 | { |
287 | 0 | int min = std::min({ x, y, z }); |
288 | 0 | if ( x == min ) |
289 | 0 | return std::min(y, z); |
290 | 0 | else if ( y == min ) |
291 | 0 | return std::min(x, z); |
292 | 0 | else // z == min |
293 | 0 | return std::min(x, y); |
294 | 0 | } |
295 | | |
296 | | // initialize data from CTOR |
297 | | void WLevDistance::InitData( const sal_Unicode* cPattern ) |
298 | 0 | { |
299 | 0 | cpPattern = aPatMem.GetcPtr(); |
300 | 0 | bpPatIsWild = aPatMem.GetbPtr(); |
301 | 0 | npDistance = aDisMem.GetPtr(); |
302 | 0 | nStars = 0; |
303 | 0 | const sal_Unicode* cp1 = cPattern; |
304 | 0 | sal_Unicode* cp2 = cpPattern; |
305 | 0 | bool* bp = bpPatIsWild; |
306 | | // copy pattern, count asterisks, escaped Jokers |
307 | 0 | while ( *cp1 ) |
308 | 0 | { |
309 | 0 | if ( *cp1 == '\\' ) // maybe escaped |
310 | 0 | { |
311 | 0 | if ( *(cp1+1) == '*' || *(cp1+1) == '?' ) // next Joker? |
312 | 0 | { |
313 | 0 | cp1++; // skip '\\' |
314 | 0 | nPatternLen--; |
315 | 0 | } |
316 | 0 | *bp++ = false; |
317 | 0 | } |
318 | 0 | else if ( *cp1 == '*' || *cp1 == '?' ) // Joker |
319 | 0 | { |
320 | 0 | if ( *cp1 == '*' ) |
321 | 0 | nStars++; |
322 | 0 | *bp++ = true; |
323 | 0 | } |
324 | 0 | else |
325 | 0 | *bp++ = false; |
326 | 0 | *cp2++ = *cp1++; |
327 | 0 | } |
328 | 0 | *cp2 = '\0'; |
329 | 0 | } |
330 | | |
331 | | WLevDistance::WLevDistance( const sal_Unicode* cPattern, |
332 | | int nOtherX, int nShorterY, int nLongerZ, |
333 | | bool bRelaxed ) : |
334 | 0 | nPatternLen( Impl_WLD_StringLen(cPattern) ), |
335 | 0 | aPatMem( nPatternLen + 1 ), |
336 | 0 | nArrayLen( nPatternLen + 1 ), |
337 | 0 | aDisMem( nArrayLen ) |
338 | 0 | { |
339 | 0 | InitData( cPattern ); |
340 | 0 | CalcLPQR( nOtherX, nShorterY, nLongerZ, bRelaxed ); |
341 | 0 | } |
342 | | |
343 | | // CopyCTor |
344 | | WLevDistance::WLevDistance( const WLevDistance& rWLD ) : |
345 | 0 | nPatternLen( rWLD.nPatternLen ), |
346 | 0 | aPatMem( nPatternLen + 1 ), |
347 | 0 | nArrayLen( nPatternLen + 1 ), |
348 | 0 | aDisMem( nArrayLen ), |
349 | 0 | nLimit( rWLD.nLimit ), |
350 | 0 | nRepP0( rWLD.nRepP0 ), |
351 | 0 | nInsQ0( rWLD.nInsQ0 ), |
352 | 0 | nDelR0( rWLD.nDelR0 ), |
353 | 0 | nStars( rWLD.nStars ), |
354 | 0 | bSplitCount( rWLD.bSplitCount ) |
355 | 0 | { |
356 | 0 | cpPattern = aPatMem.GetcPtr(); |
357 | 0 | bpPatIsWild = aPatMem.GetbPtr(); |
358 | 0 | npDistance = aDisMem.GetPtr(); |
359 | 0 | sal_Int32 i; |
360 | 0 | for ( i=0; i<nPatternLen; i++ ) |
361 | 0 | { |
362 | 0 | cpPattern[i] = rWLD.cpPattern[i]; |
363 | 0 | bpPatIsWild[i] = rWLD.bpPatIsWild[i]; |
364 | 0 | } |
365 | 0 | cpPattern[i] = '\0'; |
366 | 0 | } |
367 | | |
368 | | // DTor |
369 | | WLevDistance::~WLevDistance() |
370 | 0 | { |
371 | 0 | } |
372 | | |
373 | | /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |