/src/gettext/gettext-tools/libgettextpo/fstrcmp.c
Line | Count | Source |
1 | | /* Functions to make fuzzy comparisons between strings |
2 | | Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006, 2008-2026 Free |
3 | | Software Foundation, Inc. |
4 | | |
5 | | This program is free software: you can redistribute it and/or modify |
6 | | it under the terms of the GNU General Public License as published by |
7 | | the Free Software Foundation, either version 3 of the License, or |
8 | | (at your option) any later version. |
9 | | |
10 | | This program is distributed in the hope that it will be useful, |
11 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | GNU General Public License for more details. |
14 | | |
15 | | You should have received a copy of the GNU General Public License |
16 | | along with this program. If not, see <https://www.gnu.org/licenses/>. */ |
17 | | |
18 | | |
19 | | #include <config.h> |
20 | | |
21 | | /* Specification. */ |
22 | | #include "fstrcmp.h" |
23 | | |
24 | | #include <string.h> |
25 | | #include <stddef.h> |
26 | | #include <stdio.h> |
27 | | #include <stdint.h> |
28 | | #include <stdlib.h> |
29 | | #include <limits.h> |
30 | | |
31 | | #include "glthread/once.h" |
32 | | #include "glthread/tls.h" |
33 | | #include "minmax.h" |
34 | | #include "xalloc.h" |
35 | | |
36 | | |
37 | 0 | #define ELEMENT char |
38 | 0 | #define EQUAL(x,y) ((x) == (y)) |
39 | 0 | #define OFFSET ptrdiff_t |
40 | 0 | #define OFFSET_MAX PTRDIFF_MAX |
41 | | #define EXTRA_CONTEXT_FIELDS \ |
42 | | /* The number of edits beyond which the computation can be aborted. */ \ |
43 | | ptrdiff_t edit_count_limit; \ |
44 | | /* The number of edits (= number of elements inserted, plus the number of \ |
45 | | elements deleted), temporarily minus edit_count_limit. */ \ |
46 | | ptrdiff_t edit_count; |
47 | 0 | #define NOTE_DELETE(ctxt, xoff) ctxt->edit_count++ |
48 | 0 | #define NOTE_INSERT(ctxt, yoff) ctxt->edit_count++ |
49 | 0 | #define NOTE_ORDERED false |
50 | 0 | #define EARLY_ABORT(ctxt) ctxt->edit_count > 0 |
51 | | /* We don't need USE_HEURISTIC, since it is unlikely in typical uses of |
52 | | fstrcmp(). */ |
53 | | #include "diffseq.h" |
54 | | |
55 | | |
56 | | /* Because fstrcmp is typically called multiple times, attempt to minimize |
57 | | the number of memory allocations performed. Thus, let a call reuse the |
58 | | memory already allocated by the previous call, if it is sufficient. |
59 | | To make it multithread-safe, without need for a lock that protects the |
60 | | already allocated memory, store the allocated memory per thread. Free |
61 | | it only when the thread exits. */ |
62 | | |
63 | | static gl_tls_key_t buffer_key; /* TLS key for a 'ptrdiff_t *' */ |
64 | | static gl_tls_key_t bufmax_key; /* TLS key for a 'uintptr_t' */ |
65 | | |
66 | | static void |
67 | | keys_init (void) |
68 | 0 | { |
69 | 0 | gl_tls_key_init (buffer_key, free); |
70 | 0 | gl_tls_key_init (bufmax_key, NULL); |
71 | | /* The per-thread initial values are NULL and 0, respectively. */ |
72 | 0 | } |
73 | | |
74 | | /* Ensure that keys_init is called once only. */ |
75 | | gl_once_define(static, keys_init_once) |
76 | | |
77 | | void |
78 | | fstrcmp_free_resources (void) |
79 | 0 | { |
80 | 0 | gl_once (keys_init_once, keys_init); |
81 | 0 | ptrdiff_t *buffer = gl_tls_get (buffer_key); |
82 | 0 | if (buffer != NULL) |
83 | 0 | { |
84 | 0 | gl_tls_set (buffer_key, NULL); |
85 | 0 | gl_tls_set (bufmax_key, (void *) (uintptr_t) 0); |
86 | 0 | free (buffer); |
87 | 0 | } |
88 | 0 | } |
89 | | |
90 | | |
91 | | /* In the code below, branch probabilities were measured by Ralf Wildenhues, |
92 | | by running "msgmerge LL.po coreutils.pot" with msgmerge 0.18 for many |
93 | | values of LL. The probability indicates that the condition evaluates |
94 | | to true; whether that leads to a branch or a non-branch in the code, |
95 | | depends on the compiler's reordering of basic blocks. */ |
96 | | |
97 | | |
98 | | double |
99 | | fstrcmp_bounded (const char *string1, const char *string2, double lower_bound) |
100 | 0 | { |
101 | 0 | size_t xvec_length = strlen (string1); |
102 | 0 | size_t yvec_length = strlen (string2); |
103 | 0 | size_t length_sum = xvec_length + yvec_length; |
104 | | |
105 | | /* short-circuit obvious comparisons */ |
106 | 0 | if (xvec_length == 0 || yvec_length == 0) /* Prob: 1% */ |
107 | 0 | return length_sum == 0; |
108 | | |
109 | 0 | if (! (xvec_length <= length_sum |
110 | 0 | && length_sum <= MIN (UINTPTR_MAX, PTRDIFF_MAX) - 3)) |
111 | 0 | xalloc_die (); |
112 | |
|
113 | 0 | if (lower_bound > 0) |
114 | 0 | { |
115 | | /* Compute a quick upper bound. |
116 | | Each edit is an insertion or deletion of an element, hence modifies |
117 | | the length of the sequence by at most 1. |
118 | | Therefore, when starting from a sequence X and ending at a sequence Y, |
119 | | with N edits, | yvec_length - xvec_length | <= N. (Proof by |
120 | | induction over N.) |
121 | | So, at the end, we will have |
122 | | edit_count >= | xvec_length - yvec_length |. |
123 | | and hence |
124 | | result |
125 | | = (xvec_length + yvec_length - edit_count) |
126 | | / (xvec_length + yvec_length) |
127 | | <= (xvec_length + yvec_length - | yvec_length - xvec_length |) |
128 | | / (xvec_length + yvec_length) |
129 | | = 2 * min (xvec_length, yvec_length) / (xvec_length + yvec_length). |
130 | | */ |
131 | 0 | ptrdiff_t length_min = MIN (xvec_length, yvec_length); |
132 | 0 | volatile double upper_bound = 2.0 * length_min / length_sum; |
133 | |
|
134 | 0 | if (upper_bound < lower_bound) /* Prob: 74% */ |
135 | | /* Return an arbitrary value < LOWER_BOUND. */ |
136 | 0 | return 0.0; |
137 | | |
138 | 0 | #if CHAR_BIT <= 8 |
139 | | /* When X and Y are both small, avoid the overhead of setting up an |
140 | | array of size 256. */ |
141 | 0 | if (length_sum >= 20) /* Prob: 99% */ |
142 | 0 | { |
143 | | /* Compute a less quick upper bound. |
144 | | Each edit is an insertion or deletion of a character, hence |
145 | | modifies the occurrence count of a character by 1 and leaves the |
146 | | other occurrence counts unchanged. |
147 | | Therefore, when starting from a sequence X and ending at a |
148 | | sequence Y, and denoting the occurrence count of C in X with |
149 | | OCC (X, C), with N edits, |
150 | | sum_C | OCC (X, C) - OCC (Y, C) | <= N. |
151 | | (Proof by induction over N.) |
152 | | So, at the end, we will have |
153 | | edit_count >= sum_C | OCC (X, C) - OCC (Y, C) |, |
154 | | and hence |
155 | | result |
156 | | = (xvec_length + yvec_length - edit_count) |
157 | | / (xvec_length + yvec_length) |
158 | | <= (xvec_length + yvec_length - sum_C | OCC(X,C) - OCC(Y,C) |) |
159 | | / (xvec_length + yvec_length). |
160 | | */ |
161 | 0 | ptrdiff_t occ_diff[UCHAR_MAX + 1]; /* array C -> OCC(X,C) - OCC(Y,C) */ |
162 | | |
163 | | /* Determine the occurrence counts in X. */ |
164 | 0 | memset (occ_diff, 0, sizeof (occ_diff)); |
165 | 0 | for (ptrdiff_t i = xvec_length - 1; i >= 0; i--) |
166 | 0 | occ_diff[(unsigned char) string1[i]]++; |
167 | | /* Subtract the occurrence counts in Y. */ |
168 | 0 | for (ptrdiff_t i = yvec_length - 1; i >= 0; i--) |
169 | 0 | occ_diff[(unsigned char) string2[i]]--; |
170 | | /* Sum up the absolute values. */ |
171 | 0 | ptrdiff_t sum = 0; |
172 | 0 | for (ptrdiff_t i = 0; i <= UCHAR_MAX; i++) |
173 | 0 | { |
174 | 0 | ptrdiff_t d = occ_diff[i]; |
175 | 0 | sum += (d >= 0 ? d : -d); |
176 | 0 | } |
177 | |
|
178 | 0 | double dsum = sum; |
179 | 0 | upper_bound = 1.0 - dsum / length_sum; |
180 | |
|
181 | 0 | if (upper_bound < lower_bound) /* Prob: 66% */ |
182 | | /* Return an arbitrary value < LOWER_BOUND. */ |
183 | 0 | return 0.0; |
184 | 0 | } |
185 | 0 | #endif |
186 | 0 | } |
187 | | |
188 | | /* set the info for each string. */ |
189 | 0 | struct context ctxt; |
190 | 0 | ctxt.xvec = string1; |
191 | 0 | ctxt.yvec = string2; |
192 | | |
193 | | /* Set TOO_EXPENSIVE to be approximate square root of input size, |
194 | | bounded below by 4096. */ |
195 | 0 | ctxt.too_expensive = 1; |
196 | 0 | for (ptrdiff_t i = xvec_length + yvec_length; i != 0; i >>= 2) |
197 | 0 | ctxt.too_expensive <<= 1; |
198 | 0 | if (ctxt.too_expensive < 4096) |
199 | 0 | ctxt.too_expensive = 4096; |
200 | | |
201 | | /* Allocate memory for fdiag and bdiag from a thread-local pool. */ |
202 | 0 | ptrdiff_t fdiag_len = length_sum + 3; |
203 | 0 | gl_once (keys_init_once, keys_init); |
204 | 0 | ptrdiff_t *buffer = gl_tls_get (buffer_key); |
205 | 0 | uintptr_t bufmax = (uintptr_t) gl_tls_get (bufmax_key); |
206 | 0 | if (fdiag_len > bufmax) |
207 | 0 | { |
208 | | /* Need more memory. */ |
209 | 0 | bufmax = 2 * bufmax; |
210 | 0 | if (fdiag_len > bufmax) |
211 | 0 | bufmax = fdiag_len; |
212 | | /* Calling xrealloc would be a waste: buffer's contents does not need |
213 | | to be preserved. */ |
214 | 0 | free (buffer); |
215 | 0 | buffer = xnmalloc (bufmax, 2 * sizeof *buffer); |
216 | 0 | gl_tls_set (buffer_key, buffer); |
217 | 0 | gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax); |
218 | 0 | } |
219 | 0 | ctxt.fdiag = buffer + yvec_length + 1; |
220 | 0 | ctxt.bdiag = ctxt.fdiag + fdiag_len; |
221 | | |
222 | | /* The edit_count is only ever increased. The computation can be aborted |
223 | | when |
224 | | (xvec_length + yvec_length - edit_count) / (xvec_length + yvec_length) |
225 | | < lower_bound, |
226 | | or equivalently |
227 | | edit_count > (xvec_length + yvec_length) * (1 - lower_bound) |
228 | | or equivalently |
229 | | edit_count > floor((xvec_length + yvec_length) * (1 - lower_bound)). |
230 | | We need to add an epsilon inside the floor(...) argument, to neutralize |
231 | | rounding errors. */ |
232 | 0 | ctxt.edit_count_limit = |
233 | 0 | (lower_bound < 1.0 |
234 | 0 | ? (ptrdiff_t) (length_sum * (1.0 - lower_bound + 0.000001)) |
235 | 0 | : 0); |
236 | | |
237 | | /* Now do the main comparison algorithm */ |
238 | 0 | ctxt.edit_count = - ctxt.edit_count_limit; |
239 | 0 | if (compareseq (0, xvec_length, 0, yvec_length, 0, &ctxt)) /* Prob: 98% */ |
240 | | /* The edit_count passed the limit. Hence the result would be |
241 | | < lower_bound. We can return any value < lower_bound instead. */ |
242 | 0 | return 0.0; |
243 | 0 | ctxt.edit_count += ctxt.edit_count_limit; |
244 | | |
245 | | /* The result is |
246 | | ((number of chars in common) / (average length of the strings)). |
247 | | The numerator is |
248 | | = xvec_length - (number of calls to NOTE_DELETE) |
249 | | = yvec_length - (number of calls to NOTE_INSERT) |
250 | | = 1/2 * (xvec_length + yvec_length - (number of edits)). |
251 | | This is admittedly biased towards finding that the strings are |
252 | | similar, however it does produce meaningful results. */ |
253 | 0 | return ((double) (xvec_length + yvec_length - ctxt.edit_count) |
254 | 0 | / (xvec_length + yvec_length)); |
255 | 0 | } |