/src/libreoffice/i18npool/source/search/levdis.hxx
Line | Count | Source |
1 | | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
2 | | /* |
3 | | * This file is part of the LibreOffice project. |
4 | | * |
5 | | * This Source Code Form is subject to the terms of the Mozilla Public |
6 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
7 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8 | | * |
9 | | * This file incorporates work covered by the following license notice: |
10 | | * |
11 | | * Licensed to the Apache Software Foundation (ASF) under one or more |
12 | | * contributor license agreements. See the NOTICE file distributed |
13 | | * with this work for additional information regarding copyright |
14 | | * ownership. The ASF licenses this file to you under the Apache |
15 | | * License, Version 2.0 (the "License"); you may not use this file |
16 | | * except in compliance with the License. You may obtain a copy of |
17 | | * the License at http://www.apache.org/licenses/LICENSE-2.0 . |
18 | | */ |
19 | | |
20 | | #pragma once |
21 | | |
22 | | #include <sal/types.h> |
23 | | #include <memory> |
24 | | |
25 | | // Sensible default values for a user interface could be: |
26 | | // LEVDISDEFAULT_XOTHER 2 |
27 | | // Maximum X replacements to match query, found data may be different by X |
28 | | // characters from query. |
29 | | // LEVDISDEFAULT_YSHORTER 1 |
30 | | // Maximum Y insertions to match query, found data may be Y characters |
31 | | // shorter than query. |
32 | | // LEVDISDEFAULT_ZLONGER 3 |
33 | | // Maximum Z deletions to match query, found data may be Z characters |
34 | | // longer than query. |
35 | | // LEVDISDEFAULT_RELAXED TRUE |
36 | | // Use relaxed SplitCount instead of mathematical WLD. |
37 | | // |
38 | | // Joker/wildcards ('?' and '*') of course do not count as |
39 | | // replacement/insertion/deletion. At a '?' a replacement is not counted, for a |
40 | | // '*' the found data may be any number of characters longer than the query. |
41 | | // |
42 | | // Strict mathematical WLD: EITHER maximum X replacements OR Y characters |
43 | | // shorter OR Z characters longer. |
44 | | // Relaxed SplitCount: maximum X replacements AND/OR Y character shorter AND/OR |
45 | | // Z characters longer. Any combination of actions is valid. |
46 | | // |
47 | | // The value range for X,Y,Z is 0..33 to keep the limit within a 16 bit signed |
48 | | // integer, 31*32*33 is the maximum limit, LCM(31,32,33) == 32736. |
49 | | // |
50 | | // The corresponding internal default weigh values for these user interface |
51 | | // values would be: |
52 | | // LEVDISDEFAULTLIMIT 6 |
53 | | // Default nLimit, matches x=2, y=1, z=3, p=3, q=6, r=2 |
54 | | // LEVDISDEFAULT_P0 3 |
55 | | // Default nRepP0, weight of replacements. |
56 | | // LEVDISDEFAULT_Q0 6 |
57 | | // Default nInsQ0, weight of insertions. |
58 | | // LEVDISDEFAULT_R0 2 |
59 | | // Default nDelR0, weight of deletions. |
60 | | |
61 | | // The transformation of user input values to weighs is done using CalcLPQR(). |
62 | | // One caveat, if the WLD reaches nLimit due to nDelR0 (i.e. data string is nZ |
63 | | // characters longer than pattern) then no character can be replaced any more. |
64 | | // This can be circumvented by increasing nX or/and nZ, but of course with the |
65 | | // side effect of being less strict then... or the other solution is to use |
66 | | // relaxed SplitCount (see below), which is the default when using CalcLPQR(). |
67 | | // |
68 | | // Attention: shorter = WLD.Insert, longer = WLD.Delete |
69 | | // |
70 | | // View and counting is always from data string to pattern, a deletion means |
71 | | // that something is deleted from data to match pattern. |
72 | | // |
73 | | // Deletions weigh less in this example because usually less is known than is |
74 | | // searched for. Replacements get middle weight, for example because of |
75 | | // misspellings. Insertions are expensive. |
76 | | // |
77 | | // Another example: P0 = 1, Q0 = 4, R0 = 4, Limit = 3 |
78 | | // Allowed are maximum 4 replacements, no insertion, no deletion. |
79 | | // Matches the user interface values X = 3, Y = 0, Z = 0 |
80 | | // |
81 | | // bSplitCount: if TRUE, Rep/Ins/Del are counted differently. The return value |
82 | | // of WLD() then isn't necessarily the Levenshtein-Distance, but can be |
83 | | // negative (-WLD) if the WLD is greater than nLimit but single values are |
84 | | // within the limits. |
85 | | // For the above default values that could mean: even if the found string is |
86 | | // already 2 characters longer (nLongerZ), 3 replacements (nOtherX) can be made |
87 | | // to reach pattern. Additionally, character swaps count as one replacement. |
88 | | // Mathematically completely incorrect, but meets user expectations ;-) |
89 | | // |
90 | | // Explanation: in the real WLD all actions are withdrawn from a common 100% |
91 | | // pool, if one gets all there's nothing left for others. With bSplitCount |
92 | | // replacements have their own pool. |
93 | | |
94 | | |
95 | | /** "Safe" memory allocation in ctor */ |
96 | | class WLevDisPatternMem |
97 | | { |
98 | | std::unique_ptr<sal_Unicode[]> cp; |
99 | | std::unique_ptr<bool[]> bp; |
100 | | public: |
101 | | explicit WLevDisPatternMem( sal_Int32 s ) |
102 | 0 | : cp(new sal_Unicode[s]) |
103 | 0 | , bp(new bool[s]) |
104 | 0 | { |
105 | 0 | } |
106 | | |
107 | 0 | sal_Unicode* GetcPtr() const { return cp.get(); } |
108 | 0 | bool* GetbPtr() const { return bp.get(); } |
109 | | }; |
110 | | |
111 | | class WLevDisDistanceMem |
112 | | { |
113 | | std::unique_ptr<int[]> p; |
114 | | public: |
115 | | explicit WLevDisDistanceMem( size_t s ) |
116 | 0 | { |
117 | 0 | NewMem(s); |
118 | 0 | } |
119 | 0 | int* GetPtr() const { return p.get(); } |
120 | | int* NewMem( size_t s ) |
121 | 0 | { |
122 | 0 | p.reset(new int[ s<3 ? 3 : s ]); |
123 | 0 | return p.get(); |
124 | 0 | } |
125 | | }; |
126 | | |
127 | | /** Weighted Levenshtein Distance (WLD) |
128 | | |
129 | | For a more detailed explanation see documentation in |
130 | | i18npool/source/search/levdis.hxx |
131 | | */ |
132 | | class WLevDistance |
133 | | { |
134 | | sal_Int32 nPatternLen; ///< length of pattern |
135 | | WLevDisPatternMem aPatMem; ///< manage allocation of pattern array |
136 | | sal_Unicode* cpPattern; ///< pointer to pattern array |
137 | | bool* bpPatIsWild; ///< pointer to bool array whether pattern is wildcard |
138 | | sal_Int32 nArrayLen; ///< length of distance array |
139 | | WLevDisDistanceMem aDisMem; ///< manage allocation of distance array |
140 | | int* npDistance; ///< pointer to distance array |
141 | | int nLimit; ///< WLD limit replacements/insertions/deletions |
142 | | int nRepP0; ///< replacement weigh |
143 | | int nInsQ0; ///< insertion weigh |
144 | | int nDelR0; ///< deletion weigh |
145 | | int nStars; ///< count of '*' wildcards in pattern |
146 | | bool bSplitCount; ///< if TRUE, Rep/Ins/Del are counted separately |
147 | | |
148 | | void InitData( const sal_Unicode* cPattern ); |
149 | | static int Mid3( int x, int y, int z ); ///< middle value of 3 values |
150 | | |
151 | | public: |
152 | | |
153 | | /** CTor with user input. Internally calls CalcLPQR(). |
154 | | |
155 | | After this, obtain the resulting limit using GetLimit(). |
156 | | |
157 | | @param bRelaxed the mathematically incorrect method is default (TRUE) |
158 | | */ |
159 | | WLevDistance( const sal_Unicode* cPattern, int nOtherX, int nShorterY, |
160 | | int nLongerZ, bool bRelaxed ); |
161 | | |
162 | | WLevDistance( const WLevDistance& rWLD ); |
163 | | ~WLevDistance(); |
164 | | |
165 | | /** Calculate the Weighted Levenshtein Distance from string to pattern. */ |
166 | | int WLD( const sal_Unicode* cString, sal_Int32 nStringLen ); |
167 | | |
168 | | /** Calculate the internal weighs corresponding to the user input values. |
169 | | @returns nLimit for later comparison with WLD() |
170 | | */ |
171 | | void CalcLPQR( int nOtherX, int nShorterY, int nLongerZ, |
172 | | bool bRelaxed ); |
173 | | |
174 | 0 | int GetLimit() const { return nLimit; } |
175 | | |
176 | | // Calculate current balance, keep this inline for performance reasons! |
177 | | // c == cpPattern[jj] == cString[ii] |
178 | | // First seek up to found place, if the balance is still equal there then |
179 | | // also compare after the found place. |
180 | | int levdisbalance(sal_Int32 jj, sal_Int32 ii, sal_Unicode c, const sal_Unicode* cString, sal_Int32 nStringLen) const |
181 | 0 | { |
182 | 0 | int nBalance = 0; |
183 | |
|
184 | 0 | if ( jj != ii ) |
185 | 0 | { |
186 | 0 | sal_Int32 k; |
187 | 0 | if ( jj > 0 ) |
188 | 0 | for ( k=0; k < jj; k++ ) |
189 | 0 | if ( cpPattern[k] == c ) |
190 | 0 | nBalance++; |
191 | 0 | if ( ii > 0 ) |
192 | 0 | for ( k=0; k < ii; k++ ) |
193 | 0 | if ( cString[k] == c ) |
194 | 0 | nBalance--; |
195 | 0 | if ( !nBalance ) |
196 | 0 | { |
197 | 0 | for ( k=jj+1; k < nPatternLen; k++ ) |
198 | 0 | if ( cpPattern[k] == c ) |
199 | 0 | nBalance++; |
200 | 0 | for ( k=ii+1; k < nStringLen; k++ ) |
201 | 0 | if ( cString[k] == c ) |
202 | 0 | nBalance--; |
203 | 0 | } |
204 | 0 | } |
205 | |
|
206 | 0 | return nBalance; |
207 | 0 | } |
208 | | }; |
209 | | |
210 | | |
211 | | /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |