/src/aspell/modules/speller/default/leditdist.cpp
Line | Count | Source |
1 | | |
2 | | #include "leditdist.hpp" |
3 | | |
4 | | // The basic algorithm is as follows: |
5 | | // |
6 | | // Let A[n] represent the nth character of string n |
7 | | // A[n..] represent the substring of A starting at n |
8 | | // if n > length of A then it is considered an empty string |
9 | | // |
10 | | // edit_distance(A,B,limit) = ed(A,B,0) |
11 | | // where ed(A,B,d) = d if A & B is empty. |
12 | | // = infinity if d > limit |
13 | | // = ed(A[2..],B[2..], d) if A[1] == B[1] |
14 | | // = min ( ed(A[2..],B[2..], d+1), |
15 | | // ed(A, B[2..], d+1), |
16 | | // ed(A[2..],B, d+1) ) otherwise |
17 | | // |
18 | | // However, the code below: |
19 | | // 1) Also allows for swaps |
20 | | // 2) Allow weights to be attached to each edit |
21 | | // 3) Is not recursive, it uses a loop when it is tail recursion |
22 | | // and a small stack otherwise. The stack will NEVER be larger |
23 | | // then 2 * limit. |
24 | | // 4) Is extremely optimized |
25 | | |
26 | | |
27 | | #define check_rest(a,b,s) \ |
28 | 201M | a0 = a; b0 = b; \ |
29 | 212M | while (*a0 == *b0) { \ |
30 | 11.8M | if (*a0 == '\0') { \ |
31 | 980k | if (s < min) min = s; \ |
32 | 980k | break; \ |
33 | 980k | } \ |
34 | 11.8M | ++a0; ++b0; \ |
35 | 10.8M | } |
36 | | |
37 | | namespace aspeller { |
38 | | |
39 | | int limit_edit_distance(const char * a, const char * b, |
40 | | int limit, const EditDistanceWeights & w) |
41 | 7.39M | { |
42 | 7.39M | limit = limit*w.max; |
43 | 7.39M | static const int size = 10; |
44 | 7.39M | struct Edit { |
45 | 7.39M | const char * a; |
46 | 7.39M | const char * b; |
47 | 7.39M | int score; |
48 | 7.39M | }; |
49 | 7.39M | Edit begin[size]; |
50 | 7.39M | Edit * i = begin; |
51 | 7.39M | const char * a0; |
52 | 7.39M | const char * b0; |
53 | 7.39M | int score = 0; |
54 | 7.39M | int min = LARGE_NUM; |
55 | | |
56 | 109M | while (true) { |
57 | | |
58 | 116M | while (*a == *b) { |
59 | 7.89M | if (*a == '\0') { |
60 | 85.3k | if (score < min) min = score; |
61 | 85.3k | goto FINISH; |
62 | 85.3k | } |
63 | 7.80M | ++a; ++b; |
64 | 7.80M | } |
65 | | |
66 | 109M | if (*a == '\0') { |
67 | | |
68 | 25.1M | do { |
69 | 25.1M | score += w.del2; |
70 | 25.1M | if (score >= min) goto FINISH; |
71 | 22.9M | ++b; |
72 | 22.9M | } while (*b != '\0'); |
73 | 2.97M | min = score; |
74 | | |
75 | 103M | } else if (*b == '\0') { |
76 | | |
77 | 25.3M | do { |
78 | 25.3M | score += w.del1; |
79 | 25.3M | if (score >= min) goto FINISH; |
80 | 23.1M | ++a; |
81 | 23.1M | } while (*a != '\0'); |
82 | 551k | min = score; |
83 | | |
84 | 101M | } else { |
85 | | |
86 | 101M | if (score + w.max <= limit) { |
87 | 101M | if (limit*w.min <= w.max*(w.min+score)) { |
88 | | // if floor(score/max)=limit/max-1 then this edit is only good |
89 | | // if it makes the rest of the string match. So check if |
90 | | // the rest of the string matches to avoid the overhead of |
91 | | // pushing it on then off the stack |
92 | | |
93 | | // delete a character from a |
94 | 67.2M | check_rest(a+1,b,score + w.del1); |
95 | | |
96 | | // delete a character from b |
97 | 67.2M | check_rest(a,b+1,score + w.del2); |
98 | | |
99 | 67.2M | if (*a == *(b+1) && *b == *(a+1)) { |
100 | | |
101 | | // swap two characters |
102 | 261k | check_rest(a+2,b+2, score + w.swap); |
103 | | |
104 | 66.9M | } else { |
105 | | |
106 | | // substitute one character for another which is the same |
107 | | // thing as deleting a character from both a & b |
108 | 66.9M | check_rest(a+1,b+1, score + w.sub); |
109 | | |
110 | 66.9M | } |
111 | | |
112 | 67.2M | } else { |
113 | | |
114 | | // delete a character from a |
115 | 33.9M | i->a = a + 1; |
116 | 33.9M | i->b = b; |
117 | 33.9M | i->score = score + w.del1; |
118 | 33.9M | ++i; |
119 | | |
120 | | // delete a character from b |
121 | 33.9M | i->a = a; |
122 | 33.9M | i->b = b + 1; |
123 | 33.9M | i->score = score + w.del2; |
124 | 33.9M | ++i; |
125 | | |
126 | | // If two characters can be swapped and make a match |
127 | | // then the substitution is pointless. |
128 | | // Also, there is no need to push this on the stack as |
129 | | // it is going to be imminently removed. |
130 | 33.9M | if (*a == *(b+1) && *b == *(a+1)) { |
131 | | |
132 | | // swap two characters |
133 | 223k | a = a + 2; |
134 | 223k | b = b + 2; |
135 | 223k | score += w.swap; |
136 | 223k | continue; |
137 | | |
138 | 33.6M | } else { |
139 | | |
140 | | // substitute one character for another which is the same |
141 | | // thing as deleting a character from both a & b |
142 | 33.6M | a = a + 1; |
143 | 33.6M | b = b + 1; |
144 | 33.6M | score += w.sub; |
145 | 33.6M | continue; |
146 | | |
147 | 33.6M | } |
148 | 33.9M | } |
149 | 101M | } |
150 | 101M | } |
151 | 75.2M | FINISH: |
152 | 75.2M | if (i == begin) return min; |
153 | 67.8M | --i; |
154 | 67.8M | a = i->a; |
155 | 67.8M | b = i->b; |
156 | 67.8M | score = i->score; |
157 | 67.8M | } |
158 | 7.39M | } |
159 | | |
160 | | #undef check_rest |
161 | | #define check_rest(a,b,w) \ |
162 | 268M | a0 = a; b0 = b; \ |
163 | 308M | while(*a0 == *b0) { \ |
164 | 42.1M | if (*a0 == '\0') { \ |
165 | 2.48M | if (w < min) min = w; \ |
166 | 2.48M | break; \ |
167 | 2.48M | } \ |
168 | 42.1M | ++a0; \ |
169 | 39.7M | ++b0; \ |
170 | 39.7M | } \ |
171 | 268M | if (amax < a0) amax = a0; |
172 | | |
173 | | #define check2(a,b,w) \ |
174 | 84.5M | aa = a; bb = b; \ |
175 | 94.9M | while(*aa == *bb) { \ |
176 | 10.5M | if (*aa == '\0') { \ |
177 | 128k | if (amax < aa) amax = aa; \ |
178 | 128k | if (w < min) min = w; \ |
179 | 128k | break; \ |
180 | 128k | } \ |
181 | 10.5M | ++aa; ++bb; \ |
182 | 10.3M | } \ |
183 | 84.5M | if (*aa == '\0') { \ |
184 | 3.29M | if (amax < aa) amax = aa; \ |
185 | 3.29M | if (*bb == '\0') {} \ |
186 | 3.29M | else if (*(bb+1) == '\0' && w+ws.del2 < min) min = w+ws.del2; \ |
187 | 81.2M | } else if (*bb == '\0') { \ |
188 | 689k | ++aa; \ |
189 | 689k | if (amax < aa) amax = aa; \ |
190 | 689k | if (*aa == '\0' && w+ws.del1 < min) min = w+ws.del1; \ |
191 | 80.5M | } else { \ |
192 | 80.5M | check_rest(aa+1,bb,w+ws.del1); \ |
193 | 80.5M | check_rest(aa,bb+1,w+ws.del2); \ |
194 | 80.5M | if (*aa == *(bb+1) && *bb == *(aa+1)) { \ |
195 | 579k | check_rest(aa+2,bb+2,w+ws.swap); \ |
196 | 79.9M | } else { \ |
197 | 79.9M | check_rest(aa+1,bb+1,w+ws.sub); \ |
198 | 79.9M | } \ |
199 | 80.5M | } |
200 | | |
201 | | EditDist limit0_edit_distance(const char * a, const char * b, |
202 | | const EditDistanceWeights & ws) |
203 | 264k | { |
204 | 337k | while(*a == *b) { |
205 | 77.4k | if (*a == '\0') |
206 | 4.29k | return EditDist(0, a); |
207 | 73.1k | ++a; ++b; |
208 | 73.1k | } |
209 | | |
210 | 260k | return EditDist(LARGE_NUM, a); |
211 | 264k | } |
212 | | |
213 | | EditDist limit1_edit_distance(const char * a, const char * b, |
214 | | const EditDistanceWeights & ws) |
215 | 11.1M | { |
216 | 11.1M | int min = LARGE_NUM; |
217 | 11.1M | const char * a0; |
218 | 11.1M | const char * b0; |
219 | 11.1M | const char * amax = a; |
220 | | |
221 | 15.4M | while(*a == *b) { |
222 | 4.32M | if (*a == '\0') |
223 | 20.3k | return EditDist(0, a); |
224 | 4.30M | ++a; ++b; |
225 | 4.30M | } |
226 | | |
227 | 11.0M | if (*a == '\0') { |
228 | | |
229 | 33.6k | ++b; |
230 | 33.6k | if (*b == '\0') return EditDist(ws.del2, a); |
231 | 21.2k | return EditDist(LARGE_NUM, a); |
232 | | |
233 | 11.0M | } else if (*b == '\0') { |
234 | | |
235 | 2.10M | ++a; |
236 | 2.10M | if (*a == '\0') return EditDist(ws.del1, a); |
237 | 1.43M | return EditDist(LARGE_NUM, a); |
238 | | |
239 | 8.95M | } else { |
240 | | |
241 | | // delete a character from a |
242 | 8.95M | check_rest(a+1,b,ws.del1); |
243 | | |
244 | | // delete a character from b |
245 | 8.95M | check_rest(a,b+1,ws.del2); |
246 | | |
247 | 8.95M | if (*a == *(b+1) && *b == *(a+1)) { |
248 | | |
249 | | // swap two characters |
250 | 114k | check_rest(a+2,b+2,ws.swap); |
251 | | |
252 | 8.83M | } else { |
253 | | |
254 | | // substitute one character for another which is the same |
255 | | // thing as deleting a character from both a & b |
256 | 8.83M | check_rest(a+1,b+1,ws.sub); |
257 | | |
258 | 8.83M | } |
259 | 8.95M | } |
260 | 8.95M | return EditDist(min, amax); |
261 | 11.0M | } |
262 | | |
263 | | EditDist limit2_edit_distance(const char * a, const char * b, |
264 | | const EditDistanceWeights & ws) |
265 | 29.9M | { |
266 | 29.9M | int min = LARGE_NUM; |
267 | 29.9M | const char * a0; |
268 | 29.9M | const char * b0; |
269 | 29.9M | const char * aa; |
270 | 29.9M | const char * bb; |
271 | 29.9M | const char * amax = a; |
272 | | |
273 | 36.3M | while(*a == *b) { |
274 | 6.37M | if (*a == '\0') |
275 | 143 | return EditDist(0, a); |
276 | 6.37M | ++a; ++b; |
277 | 6.37M | } |
278 | | |
279 | 29.9M | if (*a == '\0') { |
280 | | |
281 | 1.45M | ++b; |
282 | 1.45M | if (*b == '\0') return EditDist(ws.del2, a); |
283 | 1.34M | ++b; |
284 | 1.34M | if (*b == '\0') return EditDist(2*ws.del2, a); |
285 | 964k | return EditDist(LARGE_NUM, a); |
286 | | |
287 | 28.5M | } else if (*b == '\0') { |
288 | | |
289 | 358k | ++a; |
290 | 358k | if (*a == '\0') return EditDist(ws.del1, a); |
291 | 346k | ++a; |
292 | 346k | if (*a == '\0') return EditDist(2*ws.del1, a); |
293 | 213k | return EditDist(LARGE_NUM, a); |
294 | | |
295 | 28.1M | } else { |
296 | | |
297 | | // delete a character from a |
298 | 28.1M | check2(a+1,b,ws.del1); |
299 | | |
300 | | // delete a character from b |
301 | 28.1M | check2(a,b+1,ws.del2); |
302 | | |
303 | 28.1M | if (*a == *(b+1) && *b == *(a+1)) { |
304 | | |
305 | | // swap two characters |
306 | 144k | check2(a+2,b+2,ws.swap); |
307 | | |
308 | 28.0M | } else { |
309 | | |
310 | | // substitute one character for another which is the same |
311 | | // thing as deleting a character from both a & b |
312 | 28.0M | check2(a+1,b+1,ws.sub); |
313 | | |
314 | 28.0M | } |
315 | 28.1M | } |
316 | 28.1M | return EditDist(min, amax); |
317 | 29.9M | } |
318 | | } |
319 | | |
320 | | |