/src/gettext-0.26/gettext-tools/libgettextpo/fstrcmp.c
Line | Count | Source |
1 | | /* Functions to make fuzzy comparisons between strings |
2 | | Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006, 2008-2025 Free |
3 | | Software Foundation, Inc. |
4 | | |
5 | | This program is free software: you can redistribute it and/or modify |
6 | | it under the terms of the GNU General Public License as published by |
7 | | the Free Software Foundation, either version 3 of the License, or |
8 | | (at your option) any later version. |
9 | | |
10 | | This program is distributed in the hope that it will be useful, |
11 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | GNU General Public License for more details. |
14 | | |
15 | | You should have received a copy of the GNU General Public License |
16 | | along with this program. If not, see <https://www.gnu.org/licenses/>. */ |
17 | | |
18 | | |
19 | | #include <config.h> |
20 | | |
21 | | /* Specification. */ |
22 | | #include "fstrcmp.h" |
23 | | |
24 | | #include <string.h> |
25 | | #include <stddef.h> |
26 | | #include <stdio.h> |
27 | | #include <stdint.h> |
28 | | #include <stdlib.h> |
29 | | #include <limits.h> |
30 | | |
31 | | #include "glthread/once.h" |
32 | | #include "glthread/tls.h" |
33 | | #include "minmax.h" |
34 | | #include "xalloc.h" |
35 | | |
36 | | |
37 | 0 | #define ELEMENT char |
38 | 0 | #define EQUAL(x,y) ((x) == (y)) |
39 | 0 | #define OFFSET ptrdiff_t |
40 | 0 | #define OFFSET_MAX PTRDIFF_MAX |
41 | | #define EXTRA_CONTEXT_FIELDS \ |
42 | | /* The number of edits beyond which the computation can be aborted. */ \ |
43 | | ptrdiff_t edit_count_limit; \ |
44 | | /* The number of edits (= number of elements inserted, plus the number of \ |
45 | | elements deleted), temporarily minus edit_count_limit. */ \ |
46 | | ptrdiff_t edit_count; |
47 | 0 | #define NOTE_DELETE(ctxt, xoff) ctxt->edit_count++ |
48 | 0 | #define NOTE_INSERT(ctxt, yoff) ctxt->edit_count++ |
49 | 0 | #define NOTE_ORDERED false |
50 | 0 | #define EARLY_ABORT(ctxt) ctxt->edit_count > 0 |
51 | | /* We don't need USE_HEURISTIC, since it is unlikely in typical uses of |
52 | | fstrcmp(). */ |
53 | | #include "diffseq.h" |
54 | | |
55 | | |
56 | | /* Because fstrcmp is typically called multiple times, attempt to minimize |
57 | | the number of memory allocations performed. Thus, let a call reuse the |
58 | | memory already allocated by the previous call, if it is sufficient. |
59 | | To make it multithread-safe, without need for a lock that protects the |
60 | | already allocated memory, store the allocated memory per thread. Free |
61 | | it only when the thread exits. */ |
62 | | |
63 | | static gl_tls_key_t buffer_key; /* TLS key for a 'ptrdiff_t *' */ |
64 | | static gl_tls_key_t bufmax_key; /* TLS key for a 'uintptr_t' */ |
65 | | |
66 | | static void |
67 | | keys_init (void) |
68 | 0 | { |
69 | 0 | gl_tls_key_init (buffer_key, free); |
70 | 0 | gl_tls_key_init (bufmax_key, NULL); |
71 | | /* The per-thread initial values are NULL and 0, respectively. */ |
72 | 0 | } |
73 | | |
74 | | /* Ensure that keys_init is called once only. */ |
75 | | gl_once_define(static, keys_init_once) |
76 | | |
77 | | void |
78 | | fstrcmp_free_resources (void) |
79 | 0 | { |
80 | 0 | ptrdiff_t *buffer; |
81 | |
|
82 | 0 | gl_once (keys_init_once, keys_init); |
83 | 0 | buffer = gl_tls_get (buffer_key); |
84 | 0 | if (buffer != NULL) |
85 | 0 | { |
86 | 0 | gl_tls_set (buffer_key, NULL); |
87 | 0 | gl_tls_set (bufmax_key, (void *) (uintptr_t) 0); |
88 | 0 | free (buffer); |
89 | 0 | } |
90 | 0 | } |
91 | | |
92 | | |
93 | | /* In the code below, branch probabilities were measured by Ralf Wildenhues, |
94 | | by running "msgmerge LL.po coreutils.pot" with msgmerge 0.18 for many |
95 | | values of LL. The probability indicates that the condition evaluates |
96 | | to true; whether that leads to a branch or a non-branch in the code, |
97 | | depends on the compiler's reordering of basic blocks. */ |
98 | | |
99 | | |
100 | | double |
101 | | fstrcmp_bounded (const char *string1, const char *string2, double lower_bound) |
102 | 0 | { |
103 | 0 | struct context ctxt; |
104 | 0 | size_t xvec_length = strlen (string1); |
105 | 0 | size_t yvec_length = strlen (string2); |
106 | 0 | size_t length_sum = xvec_length + yvec_length; |
107 | 0 | ptrdiff_t i; |
108 | |
|
109 | 0 | ptrdiff_t fdiag_len; |
110 | 0 | ptrdiff_t *buffer; |
111 | 0 | uintptr_t bufmax; |
112 | | |
113 | | /* short-circuit obvious comparisons */ |
114 | 0 | if (xvec_length == 0 || yvec_length == 0) /* Prob: 1% */ |
115 | 0 | return length_sum == 0; |
116 | | |
117 | 0 | if (! (xvec_length <= length_sum |
118 | 0 | && length_sum <= MIN (UINTPTR_MAX, PTRDIFF_MAX) - 3)) |
119 | 0 | xalloc_die (); |
120 | |
|
121 | 0 | if (lower_bound > 0) |
122 | 0 | { |
123 | | /* Compute a quick upper bound. |
124 | | Each edit is an insertion or deletion of an element, hence modifies |
125 | | the length of the sequence by at most 1. |
126 | | Therefore, when starting from a sequence X and ending at a sequence Y, |
127 | | with N edits, | yvec_length - xvec_length | <= N. (Proof by |
128 | | induction over N.) |
129 | | So, at the end, we will have |
130 | | edit_count >= | xvec_length - yvec_length |. |
131 | | and hence |
132 | | result |
133 | | = (xvec_length + yvec_length - edit_count) |
134 | | / (xvec_length + yvec_length) |
135 | | <= (xvec_length + yvec_length - | yvec_length - xvec_length |) |
136 | | / (xvec_length + yvec_length) |
137 | | = 2 * min (xvec_length, yvec_length) / (xvec_length + yvec_length). |
138 | | */ |
139 | 0 | ptrdiff_t length_min = MIN (xvec_length, yvec_length); |
140 | 0 | volatile double upper_bound = 2.0 * length_min / length_sum; |
141 | |
|
142 | 0 | if (upper_bound < lower_bound) /* Prob: 74% */ |
143 | | /* Return an arbitrary value < LOWER_BOUND. */ |
144 | 0 | return 0.0; |
145 | | |
146 | 0 | #if CHAR_BIT <= 8 |
147 | | /* When X and Y are both small, avoid the overhead of setting up an |
148 | | array of size 256. */ |
149 | 0 | if (length_sum >= 20) /* Prob: 99% */ |
150 | 0 | { |
151 | | /* Compute a less quick upper bound. |
152 | | Each edit is an insertion or deletion of a character, hence |
153 | | modifies the occurrence count of a character by 1 and leaves the |
154 | | other occurrence counts unchanged. |
155 | | Therefore, when starting from a sequence X and ending at a |
156 | | sequence Y, and denoting the occurrence count of C in X with |
157 | | OCC (X, C), with N edits, |
158 | | sum_C | OCC (X, C) - OCC (Y, C) | <= N. |
159 | | (Proof by induction over N.) |
160 | | So, at the end, we will have |
161 | | edit_count >= sum_C | OCC (X, C) - OCC (Y, C) |, |
162 | | and hence |
163 | | result |
164 | | = (xvec_length + yvec_length - edit_count) |
165 | | / (xvec_length + yvec_length) |
166 | | <= (xvec_length + yvec_length - sum_C | OCC(X,C) - OCC(Y,C) |) |
167 | | / (xvec_length + yvec_length). |
168 | | */ |
169 | 0 | ptrdiff_t occ_diff[UCHAR_MAX + 1]; /* array C -> OCC(X,C) - OCC(Y,C) */ |
170 | 0 | ptrdiff_t sum; |
171 | 0 | double dsum; |
172 | | |
173 | | /* Determine the occurrence counts in X. */ |
174 | 0 | memset (occ_diff, 0, sizeof (occ_diff)); |
175 | 0 | for (i = xvec_length - 1; i >= 0; i--) |
176 | 0 | occ_diff[(unsigned char) string1[i]]++; |
177 | | /* Subtract the occurrence counts in Y. */ |
178 | 0 | for (i = yvec_length - 1; i >= 0; i--) |
179 | 0 | occ_diff[(unsigned char) string2[i]]--; |
180 | | /* Sum up the absolute values. */ |
181 | 0 | sum = 0; |
182 | 0 | for (i = 0; i <= UCHAR_MAX; i++) |
183 | 0 | { |
184 | 0 | ptrdiff_t d = occ_diff[i]; |
185 | 0 | sum += (d >= 0 ? d : -d); |
186 | 0 | } |
187 | |
|
188 | 0 | dsum = sum; |
189 | 0 | upper_bound = 1.0 - dsum / length_sum; |
190 | |
|
191 | 0 | if (upper_bound < lower_bound) /* Prob: 66% */ |
192 | | /* Return an arbitrary value < LOWER_BOUND. */ |
193 | 0 | return 0.0; |
194 | 0 | } |
195 | 0 | #endif |
196 | 0 | } |
197 | | |
198 | | /* set the info for each string. */ |
199 | 0 | ctxt.xvec = string1; |
200 | 0 | ctxt.yvec = string2; |
201 | | |
202 | | /* Set TOO_EXPENSIVE to be approximate square root of input size, |
203 | | bounded below by 4096. */ |
204 | 0 | ctxt.too_expensive = 1; |
205 | 0 | for (i = xvec_length + yvec_length; i != 0; i >>= 2) |
206 | 0 | ctxt.too_expensive <<= 1; |
207 | 0 | if (ctxt.too_expensive < 4096) |
208 | 0 | ctxt.too_expensive = 4096; |
209 | | |
210 | | /* Allocate memory for fdiag and bdiag from a thread-local pool. */ |
211 | 0 | fdiag_len = length_sum + 3; |
212 | 0 | gl_once (keys_init_once, keys_init); |
213 | 0 | buffer = gl_tls_get (buffer_key); |
214 | 0 | bufmax = (uintptr_t) gl_tls_get (bufmax_key); |
215 | 0 | if (fdiag_len > bufmax) |
216 | 0 | { |
217 | | /* Need more memory. */ |
218 | 0 | bufmax = 2 * bufmax; |
219 | 0 | if (fdiag_len > bufmax) |
220 | 0 | bufmax = fdiag_len; |
221 | | /* Calling xrealloc would be a waste: buffer's contents does not need |
222 | | to be preserved. */ |
223 | 0 | free (buffer); |
224 | 0 | buffer = xnmalloc (bufmax, 2 * sizeof *buffer); |
225 | 0 | gl_tls_set (buffer_key, buffer); |
226 | 0 | gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax); |
227 | 0 | } |
228 | 0 | ctxt.fdiag = buffer + yvec_length + 1; |
229 | 0 | ctxt.bdiag = ctxt.fdiag + fdiag_len; |
230 | | |
231 | | /* The edit_count is only ever increased. The computation can be aborted |
232 | | when |
233 | | (xvec_length + yvec_length - edit_count) / (xvec_length + yvec_length) |
234 | | < lower_bound, |
235 | | or equivalently |
236 | | edit_count > (xvec_length + yvec_length) * (1 - lower_bound) |
237 | | or equivalently |
238 | | edit_count > floor((xvec_length + yvec_length) * (1 - lower_bound)). |
239 | | We need to add an epsilon inside the floor(...) argument, to neutralize |
240 | | rounding errors. */ |
241 | 0 | ctxt.edit_count_limit = |
242 | 0 | (lower_bound < 1.0 |
243 | 0 | ? (ptrdiff_t) (length_sum * (1.0 - lower_bound + 0.000001)) |
244 | 0 | : 0); |
245 | | |
246 | | /* Now do the main comparison algorithm */ |
247 | 0 | ctxt.edit_count = - ctxt.edit_count_limit; |
248 | 0 | if (compareseq (0, xvec_length, 0, yvec_length, 0, &ctxt)) /* Prob: 98% */ |
249 | | /* The edit_count passed the limit. Hence the result would be |
250 | | < lower_bound. We can return any value < lower_bound instead. */ |
251 | 0 | return 0.0; |
252 | 0 | ctxt.edit_count += ctxt.edit_count_limit; |
253 | | |
254 | | /* The result is |
255 | | ((number of chars in common) / (average length of the strings)). |
256 | | The numerator is |
257 | | = xvec_length - (number of calls to NOTE_DELETE) |
258 | | = yvec_length - (number of calls to NOTE_INSERT) |
259 | | = 1/2 * (xvec_length + yvec_length - (number of edits)). |
260 | | This is admittedly biased towards finding that the strings are |
261 | | similar, however it does produce meaningful results. */ |
262 | 0 | return ((double) (xvec_length + yvec_length - ctxt.edit_count) |
263 | 0 | / (xvec_length + yvec_length)); |
264 | 0 | } |