/src/aspell/modules/speller/default/leditdist.cpp
Line | Count | Source |
1 | | |
2 | | #include "leditdist.hpp" |
3 | | |
4 | | // The basic algorithm is as follows: |
5 | | // |
6 | | // Let A[n] represent the nth character of string n |
7 | | // A[n..] represent the substring of A starting at n |
8 | | // if n > length of A then it is considered an empty string |
9 | | // |
10 | | // edit_distance(A,B,limit) = ed(A,B,0) |
11 | | // where ed(A,B,d) = d if A & B is empty. |
12 | | // = infinity if d > limit |
13 | | // = ed(A[2..],B[2..], d) if A[1] == B[1] |
14 | | // = min ( ed(A[2..],B[2..], d+1), |
15 | | // ed(A, B[2..], d+1), |
16 | | // ed(A[2..],B, d+1) ) otherwise |
17 | | // |
18 | | // However, the code below: |
19 | | // 1) Also allows for swaps |
20 | | // 2) Allow weights to be attached to each edit |
21 | | // 3) Is not recursive, it uses a loop when it is tail recursion |
22 | | // and a small stack otherwise. The stack will NEVER be larger |
23 | | // then 2 * limit. |
24 | | // 4) Is extremely optimized |
25 | | |
26 | | |
27 | | #define check_rest(a,b,s) \ |
28 | 806M | a0 = a; b0 = b; \ |
29 | 852M | while (*a0 == *b0) { \ |
30 | 49.2M | if (*a0 == '\0') { \ |
31 | 3.15M | if (s < min) min = s; \ |
32 | 3.15M | break; \ |
33 | 3.15M | } \ |
34 | 49.2M | ++a0; ++b0; \ |
35 | 46.1M | } |
36 | | |
37 | | namespace aspeller { |
38 | | |
39 | | int limit_edit_distance(const char * a, const char * b, |
40 | | int limit, const EditDistanceWeights & w) |
41 | 26.2M | { |
42 | 26.2M | limit = limit*w.max; |
43 | 26.2M | static const int size = 10; |
44 | 26.2M | struct Edit { |
45 | 26.2M | const char * a; |
46 | 26.2M | const char * b; |
47 | 26.2M | int score; |
48 | 26.2M | }; |
49 | 26.2M | Edit begin[size]; |
50 | 26.2M | Edit * i = begin; |
51 | 26.2M | const char * a0; |
52 | 26.2M | const char * b0; |
53 | 26.2M | int score = 0; |
54 | 26.2M | int min = LARGE_NUM; |
55 | | |
56 | 438M | while (true) { |
57 | | |
58 | 465M | while (*a == *b) { |
59 | 27.2M | if (*a == '\0') { |
60 | 250k | if (score < min) min = score; |
61 | 250k | goto FINISH; |
62 | 250k | } |
63 | 27.0M | ++a; ++b; |
64 | 27.0M | } |
65 | | |
66 | 438M | if (*a == '\0') { |
67 | | |
68 | 100M | do { |
69 | 100M | score += w.del2; |
70 | 100M | if (score >= min) goto FINISH; |
71 | 90.2M | ++b; |
72 | 90.2M | } while (*b != '\0'); |
73 | 8.33M | min = score; |
74 | | |
75 | 419M | } else if (*b == '\0') { |
76 | | |
77 | 123M | do { |
78 | 123M | score += w.del1; |
79 | 123M | if (score >= min) goto FINISH; |
80 | 112M | ++a; |
81 | 112M | } while (*a != '\0'); |
82 | 2.65M | min = score; |
83 | | |
84 | 406M | } else { |
85 | | |
86 | 406M | if (score + w.max <= limit) { |
87 | 406M | if (limit*w.min <= w.max*(w.min+score)) { |
88 | | // if floor(score/max)=limit/max-1 then this edit is only good |
89 | | // if it makes the rest of the string match. So check if |
90 | | // the rest of the string matches to avoid the overhead of |
91 | | // pushing it on then off the stack |
92 | | |
93 | | // delete a character from a |
94 | 268M | check_rest(a+1,b,score + w.del1); |
95 | | |
96 | | // delete a character from b |
97 | 268M | check_rest(a,b+1,score + w.del2); |
98 | | |
99 | 268M | if (*a == *(b+1) && *b == *(a+1)) { |
100 | | |
101 | | // swap two characters |
102 | 836k | check_rest(a+2,b+2, score + w.swap); |
103 | | |
104 | 267M | } else { |
105 | | |
106 | | // substitute one character for another which is the same |
107 | | // thing as deleting a character from both a & b |
108 | 267M | check_rest(a+1,b+1, score + w.sub); |
109 | | |
110 | 267M | } |
111 | | |
112 | 268M | } else { |
113 | | |
114 | | // delete a character from a |
115 | 137M | i->a = a + 1; |
116 | 137M | i->b = b; |
117 | 137M | i->score = score + w.del1; |
118 | 137M | ++i; |
119 | | |
120 | | // delete a character from b |
121 | 137M | i->a = a; |
122 | 137M | i->b = b + 1; |
123 | 137M | i->score = score + w.del2; |
124 | 137M | ++i; |
125 | | |
126 | | // If two characters can be swapped and make a match |
127 | | // then the substitution is pointless. |
128 | | // Also, there is no need to push this on the stack as |
129 | | // it is going to be imminently removed. |
130 | 137M | if (*a == *(b+1) && *b == *(a+1)) { |
131 | | |
132 | | // swap two characters |
133 | 563k | a = a + 2; |
134 | 563k | b = b + 2; |
135 | 563k | score += w.swap; |
136 | 563k | continue; |
137 | | |
138 | 136M | } else { |
139 | | |
140 | | // substitute one character for another which is the same |
141 | | // thing as deleting a character from both a & b |
142 | 136M | a = a + 1; |
143 | 136M | b = b + 1; |
144 | 136M | score += w.sub; |
145 | 136M | continue; |
146 | | |
147 | 136M | } |
148 | 137M | } |
149 | 406M | } |
150 | 406M | } |
151 | 301M | FINISH: |
152 | 301M | if (i == begin) return min; |
153 | 275M | --i; |
154 | 275M | a = i->a; |
155 | 275M | b = i->b; |
156 | 275M | score = i->score; |
157 | 275M | } |
158 | 26.2M | } |
159 | | |
160 | | #undef check_rest |
161 | | #define check_rest(a,b,w) \ |
162 | 425M | a0 = a; b0 = b; \ |
163 | 485M | while(*a0 == *b0) { \ |
164 | 65.2M | if (*a0 == '\0') { \ |
165 | 5.39M | if (w < min) min = w; \ |
166 | 5.39M | break; \ |
167 | 5.39M | } \ |
168 | 65.2M | ++a0; \ |
169 | 59.8M | ++b0; \ |
170 | 59.8M | } \ |
171 | 425M | if (amax < a0) amax = a0; |
172 | | |
173 | | #define check2(a,b,w) \ |
174 | 133M | aa = a; bb = b; \ |
175 | 149M | while(*aa == *bb) { \ |
176 | 15.8M | if (*aa == '\0') { \ |
177 | 324k | if (amax < aa) amax = aa; \ |
178 | 324k | if (w < min) min = w; \ |
179 | 324k | break; \ |
180 | 324k | } \ |
181 | 15.8M | ++aa; ++bb; \ |
182 | 15.5M | } \ |
183 | 133M | if (*aa == '\0') { \ |
184 | 6.54M | if (amax < aa) amax = aa; \ |
185 | 6.54M | if (*bb == '\0') {} \ |
186 | 6.54M | else if (*(bb+1) == '\0' && w+ws.del2 < min) min = w+ws.del2; \ |
187 | 127M | } else if (*bb == '\0') { \ |
188 | 2.10M | ++aa; \ |
189 | 2.10M | if (amax < aa) amax = aa; \ |
190 | 2.10M | if (*aa == '\0' && w+ws.del1 < min) min = w+ws.del1; \ |
191 | 125M | } else { \ |
192 | 125M | check_rest(aa+1,bb,w+ws.del1); \ |
193 | 125M | check_rest(aa,bb+1,w+ws.del2); \ |
194 | 125M | if (*aa == *(bb+1) && *bb == *(aa+1)) { \ |
195 | 1.00M | check_rest(aa+2,bb+2,w+ws.swap); \ |
196 | 124M | } else { \ |
197 | 124M | check_rest(aa+1,bb+1,w+ws.sub); \ |
198 | 124M | } \ |
199 | 125M | } |
200 | | |
201 | | EditDist limit0_edit_distance(const char * a, const char * b, |
202 | | const EditDistanceWeights & ws) |
203 | 1.18M | { |
204 | 1.43M | while(*a == *b) { |
205 | 296k | if (*a == '\0') |
206 | 48.2k | return EditDist(0, a); |
207 | 248k | ++a; ++b; |
208 | 248k | } |
209 | | |
210 | 1.13M | return EditDist(LARGE_NUM, a); |
211 | 1.18M | } |
212 | | |
213 | | EditDist limit1_edit_distance(const char * a, const char * b, |
214 | | const EditDistanceWeights & ws) |
215 | 22.3M | { |
216 | 22.3M | int min = LARGE_NUM; |
217 | 22.3M | const char * a0; |
218 | 22.3M | const char * b0; |
219 | 22.3M | const char * amax = a; |
220 | | |
221 | 32.3M | while(*a == *b) { |
222 | 10.0M | if (*a == '\0') |
223 | 60.4k | return EditDist(0, a); |
224 | 9.96M | ++a; ++b; |
225 | 9.96M | } |
226 | | |
227 | 22.2M | if (*a == '\0') { |
228 | | |
229 | 58.0k | ++b; |
230 | 58.0k | if (*b == '\0') return EditDist(ws.del2, a); |
231 | 21.3k | return EditDist(LARGE_NUM, a); |
232 | | |
233 | 22.2M | } else if (*b == '\0') { |
234 | | |
235 | 5.63M | ++a; |
236 | 5.63M | if (*a == '\0') return EditDist(ws.del1, a); |
237 | 3.96M | return EditDist(LARGE_NUM, a); |
238 | | |
239 | 16.5M | } else { |
240 | | |
241 | | // delete a character from a |
242 | 16.5M | check_rest(a+1,b,ws.del1); |
243 | | |
244 | | // delete a character from b |
245 | 16.5M | check_rest(a,b+1,ws.del2); |
246 | | |
247 | 16.5M | if (*a == *(b+1) && *b == *(a+1)) { |
248 | | |
249 | | // swap two characters |
250 | 215k | check_rest(a+2,b+2,ws.swap); |
251 | | |
252 | 16.3M | } else { |
253 | | |
254 | | // substitute one character for another which is the same |
255 | | // thing as deleting a character from both a & b |
256 | 16.3M | check_rest(a+1,b+1,ws.sub); |
257 | | |
258 | 16.3M | } |
259 | 16.5M | } |
260 | 16.5M | return EditDist(min, amax); |
261 | 22.2M | } |
262 | | |
263 | | EditDist limit2_edit_distance(const char * a, const char * b, |
264 | | const EditDistanceWeights & ws) |
265 | 49.8M | { |
266 | 49.8M | int min = LARGE_NUM; |
267 | 49.8M | const char * a0; |
268 | 49.8M | const char * b0; |
269 | 49.8M | const char * aa; |
270 | 49.8M | const char * bb; |
271 | 49.8M | const char * amax = a; |
272 | | |
273 | 59.7M | while(*a == *b) { |
274 | 9.84M | if (*a == '\0') |
275 | 814 | return EditDist(0, a); |
276 | 9.84M | ++a; ++b; |
277 | 9.84M | } |
278 | | |
279 | 49.8M | if (*a == '\0') { |
280 | | |
281 | 3.14M | ++b; |
282 | 3.14M | if (*b == '\0') return EditDist(ws.del2, a); |
283 | 2.91M | ++b; |
284 | 2.91M | if (*b == '\0') return EditDist(2*ws.del2, a); |
285 | 2.06M | return EditDist(LARGE_NUM, a); |
286 | | |
287 | 46.7M | } else if (*b == '\0') { |
288 | | |
289 | 2.08M | ++a; |
290 | 2.08M | if (*a == '\0') return EditDist(ws.del1, a); |
291 | 2.04M | ++a; |
292 | 2.04M | if (*a == '\0') return EditDist(2*ws.del1, a); |
293 | 1.46M | return EditDist(LARGE_NUM, a); |
294 | | |
295 | 44.6M | } else { |
296 | | |
297 | | // delete a character from a |
298 | 44.6M | check2(a+1,b,ws.del1); |
299 | | |
300 | | // delete a character from b |
301 | 44.6M | check2(a,b+1,ws.del2); |
302 | | |
303 | 44.6M | if (*a == *(b+1) && *b == *(a+1)) { |
304 | | |
305 | | // swap two characters |
306 | 297k | check2(a+2,b+2,ws.swap); |
307 | | |
308 | 44.3M | } else { |
309 | | |
310 | | // substitute one character for another which is the same |
311 | | // thing as deleting a character from both a & b |
312 | 44.3M | check2(a+1,b+1,ws.sub); |
313 | | |
314 | 44.3M | } |
315 | 44.6M | } |
316 | 44.6M | return EditDist(min, amax); |
317 | 49.8M | } |
318 | | } |
319 | | |
320 | | |