Coverage Report

Created: 2025-12-08 09:28

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libreoffice/i18npool/source/search/levdis.hxx
Line
Count
Source
1
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2
/*
3
 * This file is part of the LibreOffice project.
4
 *
5
 * This Source Code Form is subject to the terms of the Mozilla Public
6
 * License, v. 2.0. If a copy of the MPL was not distributed with this
7
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8
 *
9
 * This file incorporates work covered by the following license notice:
10
 *
11
 *   Licensed to the Apache Software Foundation (ASF) under one or more
12
 *   contributor license agreements. See the NOTICE file distributed
13
 *   with this work for additional information regarding copyright
14
 *   ownership. The ASF licenses this file to you under the Apache
15
 *   License, Version 2.0 (the "License"); you may not use this file
16
 *   except in compliance with the License. You may obtain a copy of
17
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18
 */
19
20
#pragma once
21
22
#include <sal/types.h>
23
#include <memory>
24
25
// Sensible default values for a user interface could be:
26
//  LEVDISDEFAULT_XOTHER    2
27
//      Maximum X replacements to match query, found data may be different by X
28
//      characters from query.
29
//  LEVDISDEFAULT_YSHORTER  1
30
//      Maximum Y insertions to match query, found data may be Y characters
31
//      shorter than query.
32
//  LEVDISDEFAULT_ZLONGER   3
33
//      Maximum Z deletions to match query, found data may be Z characters
34
//      longer than query.
35
//  LEVDISDEFAULT_RELAXED   TRUE
36
//      Use relaxed SplitCount instead of mathematical WLD.
37
//
38
// Joker/wildcards ('?' and '*') of course do not count as
39
// replacement/insertion/deletion. At a '?' a replacement is not counted, for a
40
// '*' the found data may be any number of characters longer than the query.
41
//
42
// Strict mathematical WLD: EITHER maximum X replacements OR Y characters
43
// shorter OR Z characters longer.
44
// Relaxed SplitCount: maximum X replacements AND/OR Y character shorter AND/OR
45
// Z characters longer. Any combination of actions is valid.
46
//
47
// The value range for X,Y,Z is 0..33 to keep the limit within a 16 bit signed
48
// integer, 31*32*33 is the maximum limit, LCM(31,32,33) == 32736.
49
//
50
// The corresponding internal default weigh values for these user interface
51
// values would be:
52
//  LEVDISDEFAULTLIMIT  6
53
//      Default nLimit, matches x=2, y=1, z=3, p=3, q=6, r=2
54
//  LEVDISDEFAULT_P0    3
55
//      Default nRepP0, weight of replacements.
56
//  LEVDISDEFAULT_Q0    6
57
//      Default nInsQ0, weight of insertions.
58
//  LEVDISDEFAULT_R0    2
59
//      Default nDelR0, weight of deletions.
60
61
// The transformation of user input values to weighs is done using CalcLPQR().
62
// One caveat, if the WLD reaches nLimit due to nDelR0 (i.e. data string is nZ
63
// characters longer than pattern) then no character can be replaced any more.
64
// This can be circumvented by increasing nX or/and nZ, but of course with the
65
// side effect of being less strict then... or the other solution is to use
66
// relaxed SplitCount (see below), which is the default when using CalcLPQR().
67
//
68
// Attention: shorter = WLD.Insert, longer = WLD.Delete
69
//
70
// View and counting is always from data string to pattern, a deletion means
71
// that something is deleted from data to match pattern.
72
//
73
// Deletions weigh less in this example because usually less is known than is
74
// searched for. Replacements get middle weight, for example because of
75
// misspellings. Insertions are expensive.
76
//
77
// Another example: P0 = 1, Q0 = 4, R0 = 4, Limit = 3
78
// Allowed are maximum 4 replacements, no insertion, no deletion.
79
// Matches the user interface values X = 3, Y = 0, Z = 0
80
//
81
// bSplitCount: if TRUE, Rep/Ins/Del are counted differently. The return value
82
// of WLD() then isn't necessarily the Levenshtein-Distance, but can be
83
// negative (-WLD) if the WLD is greater than nLimit but single values are
84
// within the limits.
85
// For the above default values that could mean: even if the found string is
86
// already 2 characters longer (nLongerZ), 3 replacements (nOtherX) can be made
87
// to reach pattern. Additionally, character swaps count as one replacement.
88
// Mathematically completely incorrect, but meets user expectations ;-)
89
//
90
// Explanation: in the real WLD all actions are withdrawn from a common 100%
91
// pool, if one gets all there's nothing left for others. With bSplitCount
92
// replacements have their own pool.
93
94
95
/** "Safe" memory allocation in ctor */
96
class WLevDisPatternMem
97
{
98
    std::unique_ptr<sal_Unicode[]> cp;
99
    std::unique_ptr<bool[]>        bp;
100
public:
101
    explicit WLevDisPatternMem( sal_Int32 s )
102
0
        : cp(new sal_Unicode[s])
103
0
        , bp(new bool[s])
104
0
    {
105
0
    }
106
107
0
    sal_Unicode* GetcPtr() const        { return cp.get(); }
108
0
    bool* GetbPtr() const               { return bp.get(); }
109
};
110
111
class WLevDisDistanceMem
112
{
113
    std::unique_ptr<int[]> p;
114
public:
115
    explicit WLevDisDistanceMem( size_t s )
116
0
    {
117
0
        NewMem(s);
118
0
    }
119
0
    int* GetPtr() const             { return p.get(); }
120
    int* NewMem( size_t s )
121
0
    {
122
0
        p.reset(new int[ s<3 ? 3 : s ]);
123
0
        return p.get();
124
0
    }
125
};
126
127
/** Weighted Levenshtein Distance (WLD)
128
129
    For a more detailed explanation see documentation in
130
    i18npool/source/search/levdis.hxx
131
 */
132
class WLevDistance
133
{
134
    sal_Int32       nPatternLen;    ///< length of pattern
135
    WLevDisPatternMem   aPatMem;    ///< manage allocation of pattern array
136
    sal_Unicode*    cpPattern;      ///< pointer to pattern array
137
    bool*           bpPatIsWild;    ///< pointer to bool array whether pattern is wildcard
138
    sal_Int32       nArrayLen;      ///< length of distance array
139
    WLevDisDistanceMem  aDisMem;    ///< manage allocation of distance array
140
    int*            npDistance;     ///< pointer to distance array
141
    int             nLimit;         ///< WLD limit replacements/insertions/deletions
142
    int             nRepP0;         ///< replacement weigh
143
    int             nInsQ0;         ///< insertion weigh
144
    int             nDelR0;         ///< deletion weigh
145
    int             nStars;         ///< count of '*' wildcards in pattern
146
    bool            bSplitCount;    ///< if TRUE, Rep/Ins/Del are counted separately
147
148
    void InitData( const sal_Unicode* cPattern );
149
    static int Mid3( int x, int y, int z );        ///< middle value of 3 values
150
151
public:
152
153
    /** CTor with user input. Internally calls CalcLPQR().
154
155
        After this, obtain the resulting limit using GetLimit().
156
157
        @param  bRelaxed    the mathematically incorrect method is default (TRUE)
158
     */
159
    WLevDistance( const sal_Unicode* cPattern, int nOtherX, int nShorterY,
160
                    int nLongerZ, bool bRelaxed );
161
162
    WLevDistance( const WLevDistance& rWLD );
163
    ~WLevDistance();
164
165
    /** Calculate the Weighted Levenshtein Distance from string to pattern. */
166
    int WLD( const sal_Unicode* cString, sal_Int32 nStringLen );
167
168
    /** Calculate the internal weighs corresponding to the user input values.
169
        @returns nLimit for later comparison with WLD()
170
     */
171
    void CalcLPQR( int nOtherX, int nShorterY, int nLongerZ,
172
                    bool bRelaxed );
173
174
0
    int GetLimit() const     { return nLimit; }
175
176
    // Calculate current balance, keep this inline for performance reasons!
177
    // c == cpPattern[jj] == cString[ii]
178
    // First seek up to found place, if the balance is still equal there then
179
    // also compare after the found place.
180
    int levdisbalance(sal_Int32 jj, sal_Int32 ii, sal_Unicode c, const sal_Unicode* cString, sal_Int32 nStringLen) const
181
0
    {
182
0
        int nBalance = 0;
183
184
0
        if ( jj != ii )
185
0
        {
186
0
            sal_Int32 k;
187
0
            if ( jj > 0 )
188
0
                for ( k=0; k < jj; k++ )
189
0
                    if ( cpPattern[k] == c )
190
0
                        nBalance++;
191
0
            if ( ii > 0 )
192
0
                for ( k=0; k < ii; k++ )
193
0
                    if ( cString[k] == c )
194
0
                        nBalance--;
195
0
            if ( !nBalance )
196
0
            {
197
0
                for ( k=jj+1; k < nPatternLen; k++ )
198
0
                    if ( cpPattern[k] == c )
199
0
                        nBalance++;
200
0
                for ( k=ii+1; k < nStringLen; k++ )
201
0
                    if ( cString[k] == c )
202
0
                        nBalance--;
203
0
            }
204
0
        }
205
206
0
        return nBalance;
207
0
    }
208
};
209
210
211
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */