Coverage Report

Created: 2025-11-16 09:57

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libreoffice/i18npool/source/search/levdis.cxx
Line
Count
Source
1
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2
/*
3
 * This file is part of the LibreOffice project.
4
 *
5
 * This Source Code Form is subject to the terms of the Mozilla Public
6
 * License, v. 2.0. If a copy of the MPL was not distributed with this
7
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8
 *
9
 * This file incorporates work covered by the following license notice:
10
 *
11
 *   Licensed to the Apache Software Foundation (ASF) under one or more
12
 *   contributor license agreements. See the NOTICE file distributed
13
 *   with this work for additional information regarding copyright
14
 *   ownership. The ASF licenses this file to you under the Apache
15
 *   License, Version 2.0 (the "License"); you may not use this file
16
 *   except in compliance with the License. You may obtain a copy of
17
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18
 */
19
20
/*
21
22
    Weighted Levenshtein Distance
23
    including wildcards
24
    '*' for any number (0 or more) of arbitrary characters
25
    '?' for exactly one arbitrary character
26
    escapable with  backslash, "\*" or "\?"
27
28
    Return:
29
        WLD if WLD <= nLimit, else nLimit+1
30
31
    or, if bSplitCount:
32
        WLD if WLD <= nLimit
33
        -WLD if Replace and Insert and Delete <= nLimit
34
        else nLimit+1
35
36
    Recursive definition of WLD:
37
38
    WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
39
                             WLD( X(i)  , Y(j-1) ) + q      ,
40
                             WLD( X(i-1), Y(j)   ) + r      )
41
42
    X(i)   := the first i characters of the word X
43
    Y(j)   := the first j characters of the word Y
44
    p(i,j) := 0 if i-th character of X == j-th character of Y,
45
              p else
46
47
    Boundary conditions:
48
    WLD( X(0), Y(j) ) := j*q  (Y created by j inserts)
49
    WLD( X(i), Y(0) ) := i*r  (Y created by i deletes)
50
    WLD( X(0), Y(0) ) := 0
51
52
    Instead of recursions a dynamic algorithm is used.
53
54
    See also: German computer magazine
55
    c't 07/89 pages 192-208 and c't 03/94 pages 230-239
56
*/
57
58
#include <algorithm>
59
#include <numeric>
60
61
#include "levdis.hxx"
62
63
0
#define LEVDISBIG   (nLimit + 1)    // Return value if distance > nLimit
64
0
#define LEVDISDOUBLEBUF 2048        // no doubling atop this border
65
66
static sal_Int32 Impl_WLD_StringLen( const sal_Unicode* pStr )
67
0
{
68
0
    const sal_Unicode* pTempStr = pStr;
69
0
    while( *pTempStr )
70
0
        pTempStr++;
71
0
    return static_cast<sal_Int32>(pTempStr-pStr);
72
0
}
73
74
// Distance from string to pattern
75
int WLevDistance::WLD( const sal_Unicode* cString, sal_Int32 nStringLen )
76
0
{
77
0
    int nSPMin = 0;     // penalty point Minimum
78
0
    int nRepS = 0;      // for SplitCount
79
80
    // length difference between pattern and string
81
0
    int nLenDiff = nPatternLen - nStars - nStringLen;
82
    // more insertions or deletions necessary as the limit? Then leave
83
0
    if ( (nLenDiff * nInsQ0 > nLimit)
84
0
            || ((nStars == 0) && (nLenDiff * nDelR0 < -nLimit)) )
85
0
        return LEVDISBIG;
86
87
     // comparative String greater than  instantaneous array
88
    // -> adapt array size
89
0
    if ( nStringLen >= nArrayLen )
90
0
    {
91
        // increase size much more to avoid reallocation
92
0
        if ( nStringLen < LEVDISDOUBLEBUF )
93
0
            nArrayLen = 2 * nStringLen;
94
0
        else
95
0
            nArrayLen = nStringLen + 1;
96
0
        npDistance = aDisMem.NewMem( nArrayLen );
97
0
    }
98
99
    // Calculate start values of the second column (first pattern value).
100
    // First column (0-Len pattern) is always zero .. nStringLen * nInsQ0,
101
    // therefore the minimum is 0
102
0
    if ( nPatternLen == 0 )
103
0
    {
104
        // Count of deletions to reach pattern
105
0
        for ( sal_Int32 i=0; i <= nStringLen; i++ )
106
0
            npDistance[i] = i * nDelR0;
107
0
    }
108
0
    else if ( cpPattern[0] == '*' && bpPatIsWild[0] )
109
0
    {
110
        // instead of a '*' you can fit in anything
111
0
        for ( sal_Int32 i=0; i <= nStringLen; i++ )
112
0
            npDistance[i] = 0;
113
0
    }
114
0
    else
115
0
    {
116
0
        sal_Unicode c;
117
0
        int nP;
118
0
        c = cpPattern[0];
119
0
        if ( c == '?' && bpPatIsWild[0] )
120
0
            nP = 0;     // a '?' could be any character.
121
0
        else
122
            // Minimum of replacement and deletion+insertion weighting
123
0
            nP = std::min({ nRepP0, nRepP0, nDelR0 + nInsQ0 });
124
0
        npDistance[0] = nInsQ0;     // start with simple insert
125
0
        npDistance[1] = nInsQ0;
126
0
        npDistance[2] = nInsQ0;
127
0
        int nReplacePos = -1;       // tristate flag
128
0
        int nDelCnt = 0;
129
0
        for ( sal_Int32 i=1; i <= nStringLen; i++, nDelCnt += nDelR0 )
130
0
        {
131
0
            if ( cString[i-1] == c )
132
0
                nP = 0;     // Replace from this position is 0
133
            // Deletions to match pattern + Replace
134
0
            npDistance[i] = nDelCnt + nP;
135
0
            if ( bSplitCount )
136
0
            {
137
0
                if ( nReplacePos < 0 && nP )
138
0
                {   // this position will be replaced
139
0
                    nRepS++;
140
0
                    nReplacePos = i;
141
0
                }
142
0
                else if ( nReplacePos > 0 && !nP )
143
0
                {
144
                    // same count of c
145
0
                    int nBalance = levdisbalance( 0, i-1, c, cString, nStringLen );
146
0
                    if ( !nBalance )
147
0
                    {   // one was replaced that was an insertion instead
148
0
                        nRepS--;
149
0
                        nReplacePos = 0;
150
0
                    }
151
0
                }
152
0
            }
153
0
        }
154
0
        nSPMin = std::min({ npDistance[0], npDistance[1], npDistance[2] });
155
0
    }
156
157
    // calculate distance matrix
158
0
    sal_Int32 j = 0;        // for all columns of the pattern, till limit is not reached
159
0
    while ( (j < nPatternLen-1)
160
0
            && nSPMin <= (bSplitCount ? 2 * nLimit : nLimit) )
161
0
    {
162
0
        sal_Unicode c;
163
0
        int nP, nQ, nR, nPij, d2;
164
165
0
        j++;
166
0
        c = cpPattern[j];
167
0
        if ( bpPatIsWild[j] )   // '*' or '?' not escaped
168
0
            nP = 0;     // could be replaced without penalty
169
0
        else
170
0
            nP = nRepP0;
171
0
        if ( c == '*' && bpPatIsWild[j] )
172
0
        {
173
0
            nQ = 0;     // insertion and deletion without penalty
174
0
            nR = 0;
175
0
        }
176
0
        else
177
0
        {
178
0
            nQ = nInsQ0;    // usual weighting
179
0
            nR = nDelR0;
180
0
        }
181
0
        d2 = npDistance[0];
182
        // increase insert count to get from null string to pattern
183
0
        npDistance[0] = npDistance[0] + nQ;
184
0
        nSPMin = npDistance[0];
185
0
        int nReplacePos = -1;       // tristate flag
186
        // for each pattern column run through the string
187
0
        for ( sal_Int32 i=1; i <= nStringLen; i++ )
188
0
        {
189
0
            int d1 = d2;            // WLD( X(i-1), Y(j-1) )
190
0
            d2 = npDistance[i];     // WLD( X(i)  , Y(j-1) )
191
0
            if ( cString[i-1] == c )
192
0
            {
193
0
                nPij = 0;           // p(i,j)
194
0
                if ( nReplacePos < 0 )
195
0
                {
196
                    // same count of c
197
0
                    int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
198
0
                    if ( !nBalance )
199
0
                        nReplacePos = 0;    // no replacement
200
0
                }
201
0
            }
202
0
            else
203
0
                nPij = nP;
204
            // WLD( X(i), Y(j) ) = min( WLD( X(i-1), Y(j-1) ) + p(i,j) ,
205
            //                          WLD( X(i)  , Y(j-1) ) + q      ,
206
            //                          WLD( X(i-1), Y(j)   ) + r      )
207
0
            npDistance[i] = std::min({ d1 + nPij, d2 + nQ, npDistance[i-1] + nR });
208
0
            if ( npDistance[i] < nSPMin )
209
0
                nSPMin = npDistance[i];
210
0
            if ( bSplitCount )
211
0
            {
212
0
                if ( nReplacePos < 0 && nPij && npDistance[i] == d1 + nPij )
213
0
                {   // this position will be replaced
214
0
                    nRepS++;
215
0
                    nReplacePos = i;
216
0
                }
217
0
                else if ( nReplacePos > 0 && !nPij )
218
0
                {
219
                    // character is equal in string and pattern
220
                    //
221
                    // If from this point:
222
                    // * pattern and string have the same count of this
223
                    //   character
224
                    // * and character count is the same before this position
225
                    // then the replace was none.
226
                    //
227
                    // Scrambled letters are recognized here and the nRepS
228
                    // replacement is withdrawn, whereby the double limit kicks
229
                    // in.
230
231
                    // Same count of c
232
0
                    int nBalance = levdisbalance( j, i-1, c, cString, nStringLen );
233
0
                    if ( !nBalance )
234
0
                    {   // one was replaced that was an insertion instead
235
0
                        nRepS--;
236
0
                        nReplacePos = 0;
237
0
                    }
238
0
                }
239
0
            }
240
0
        }
241
0
    }
242
0
    if ( (nSPMin <= nLimit) && (npDistance[nStringLen] <= nLimit) )
243
0
        return npDistance[nStringLen];
244
0
    else
245
0
    {
246
0
        if ( bSplitCount )
247
0
        {
248
0
            if ( nRepS && nLenDiff > 0 )
249
0
                nRepS -= nLenDiff;      // Inserts were counted
250
0
            if ( (nSPMin <= 2 * nLimit)
251
0
                    && (npDistance[nStringLen] <= 2 * nLimit)
252
0
                    && (nRepS * nRepP0 <= nLimit) )
253
0
                return -npDistance[nStringLen];
254
0
            return LEVDISBIG;
255
0
        }
256
0
        return LEVDISBIG;
257
0
    }
258
0
}
259
260
// Calculating      nLimit,   nReplP0,    nInsQ0,     nDelR0,     bSplitCount
261
// from user values           nOtherX,    nShorterY,  nLongerZ,   bRelaxed
262
void WLevDistance::CalcLPQR( int nX, int nY, int nZ, bool bRelaxed )
263
0
{
264
0
    if ( nX < 0 ) nX = 0;       // only positive values
265
0
    if ( nY < 0 ) nY = 0;
266
0
    if ( nZ < 0 ) nZ = 0;
267
0
    if (0 == std::min({ nX, nY, nZ })) // at least one 0
268
0
    {
269
0
        int nMid, nMax;
270
0
        nMax = std::max({ nX, nY, nZ }); // either 0 for three 0s or Max
271
0
        if ( 0 == (nMid = Mid3( nX, nY, nZ )) ) // even two 0
272
0
            nLimit = nMax;                      // either 0 or the only one >0
273
0
        else                                    // one is 0
274
0
            nLimit = std::lcm( nMid, nMax );
275
0
    }
276
0
    else                                        // all three of them are not 0
277
0
        nLimit = std::lcm(std::lcm(nX, nY), nZ);
278
0
    nRepP0 = ( nX ? nLimit / nX : nLimit + 1 );
279
0
    nInsQ0 = ( nY ? nLimit / nY : nLimit + 1 );
280
0
    nDelR0 = ( nZ ? nLimit / nZ : nLimit + 1 );
281
0
    bSplitCount = bRelaxed;
282
0
}
283
284
// The value in the middle
285
int WLevDistance::Mid3( int x, int y, int z )
286
0
{
287
0
    int min = std::min({ x, y, z });
288
0
    if ( x == min )
289
0
        return std::min(y, z);
290
0
    else if ( y == min )
291
0
        return std::min(x, z);
292
0
    else        // z == min
293
0
        return std::min(x, y);
294
0
}
295
296
// initialize data from CTOR
297
void WLevDistance::InitData( const sal_Unicode* cPattern )
298
0
{
299
0
    cpPattern = aPatMem.GetcPtr();
300
0
    bpPatIsWild = aPatMem.GetbPtr();
301
0
    npDistance = aDisMem.GetPtr();
302
0
    nStars = 0;
303
0
    const sal_Unicode* cp1 = cPattern;
304
0
    sal_Unicode* cp2 = cpPattern;
305
0
    bool* bp = bpPatIsWild;
306
    // copy pattern, count asterisks, escaped Jokers
307
0
    while ( *cp1 )
308
0
    {
309
0
        if ( *cp1 == '\\' )     // maybe escaped
310
0
        {
311
0
            if ( *(cp1+1) == '*' || *(cp1+1) == '?' )   // next Joker?
312
0
            {
313
0
                cp1++;          // skip '\\'
314
0
                nPatternLen--;
315
0
            }
316
0
            *bp++ = false;
317
0
        }
318
0
        else if ( *cp1 == '*' || *cp1 == '?' )      // Joker
319
0
        {
320
0
            if ( *cp1 == '*' )
321
0
                nStars++;
322
0
            *bp++ = true;
323
0
        }
324
0
        else
325
0
            *bp++ = false;
326
0
        *cp2++ = *cp1++;
327
0
    }
328
0
    *cp2 = '\0';
329
0
}
330
331
WLevDistance::WLevDistance( const sal_Unicode* cPattern,
332
                            int nOtherX, int nShorterY, int nLongerZ,
333
                            bool bRelaxed ) :
334
0
    nPatternLen( Impl_WLD_StringLen(cPattern) ),
335
0
    aPatMem( nPatternLen + 1 ),
336
0
    nArrayLen( nPatternLen + 1 ),
337
0
    aDisMem( nArrayLen )
338
0
{
339
0
    InitData( cPattern );
340
0
    CalcLPQR( nOtherX, nShorterY, nLongerZ, bRelaxed );
341
0
}
342
343
// CopyCTor
344
WLevDistance::WLevDistance( const WLevDistance& rWLD ) :
345
0
    nPatternLen( rWLD.nPatternLen ),
346
0
    aPatMem( nPatternLen + 1 ),
347
0
    nArrayLen( nPatternLen + 1 ),
348
0
    aDisMem( nArrayLen ),
349
0
    nLimit( rWLD.nLimit ),
350
0
    nRepP0( rWLD.nRepP0 ),
351
0
    nInsQ0( rWLD.nInsQ0 ),
352
0
    nDelR0( rWLD.nDelR0 ),
353
0
    nStars( rWLD.nStars ),
354
0
    bSplitCount( rWLD.bSplitCount )
355
0
{
356
0
    cpPattern = aPatMem.GetcPtr();
357
0
    bpPatIsWild = aPatMem.GetbPtr();
358
0
    npDistance = aDisMem.GetPtr();
359
0
    sal_Int32 i;
360
0
    for ( i=0; i<nPatternLen; i++ )
361
0
    {
362
0
        cpPattern[i] = rWLD.cpPattern[i];
363
0
        bpPatIsWild[i] = rWLD.bpPatIsWild[i];
364
0
    }
365
0
    cpPattern[i] = '\0';
366
0
}
367
368
// DTor
369
WLevDistance::~WLevDistance()
370
0
{
371
0
}
372
373
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */