Coverage Report

Created: 2025-12-11 07:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aspell/modules/speller/default/leditdist.cpp
Line
Count
Source
1
2
#include "leditdist.hpp"
3
4
// The basic algorithm is as follows:
5
//
6
// Let A[n] represent the nth character of string n
7
//     A[n..] represent the substring of A starting at n
8
//            if n > length of A then it is considered an empty string
9
//
10
// edit_distance(A,B,limit) = ed(A,B,0)
11
//   where ed(A,B,d) = d                              if A & B is empty.
12
//                   = infinity                       if d > limit
13
//                   = ed(A[2..],B[2..], d)           if A[1] == B[1] 
14
//                   = min ( ed(A[2..],B[2..], d+1),  
15
//                           ed(A,     B[2..], d+1),
16
//                           ed(A[2..],B,      d+1) ) otherwise
17
//
18
// However, the code below:
19
// 1) Also allows for swaps
20
// 2) Allow weights to be attached to each edit
21
// 3) Is not recursive, it uses a loop when it is tail recursion
22
//    and a small stack otherwise.  The stack will NEVER be larger
23
//    then 2 * limit.
24
// 4) Is extremely optimized
25
26
27
#define check_rest(a,b,s)   \
28
923M
  a0 = a; b0 = b;           \
29
986M
  while (*a0 == *b0) {      \
30
67.6M
    if (*a0 == '\0') {      \
31
4.68M
      if (s < min) min = s; \
32
4.68M
      break;                \
33
4.68M
    }                       \
34
67.6M
    ++a0; ++b0;             \
35
62.9M
  }
36
37
namespace aspeller {
38
39
  int limit_edit_distance(const char * a, const char * b, 
40
        int limit, const EditDistanceWeights & w)
41
30.5M
  {
42
30.5M
    limit = limit*w.max;
43
30.5M
    static const int size = 10;
44
30.5M
    struct Edit {
45
30.5M
      const char * a;
46
30.5M
      const char * b;
47
30.5M
      int score;
48
30.5M
    };
49
30.5M
    Edit begin[size];
50
30.5M
    Edit * i = begin;
51
30.5M
    const char * a0;
52
30.5M
    const char * b0;
53
30.5M
    int score = 0;
54
30.5M
    int min = LARGE_NUM;
55
    
56
511M
    while (true) {
57
      
58
555M
      while (*a == *b) {
59
43.9M
  if (*a == '\0') { 
60
199k
    if (score < min) min = score;
61
199k
    goto FINISH;
62
199k
  } 
63
43.7M
  ++a; ++b;
64
43.7M
      }
65
66
511M
      if (*a == '\0') {
67
68
179M
  do {
69
179M
    score += w.del2;
70
179M
    if (score >= min) goto FINISH;
71
159M
    ++b;
72
159M
  } while (*b != '\0');
73
12.0M
  min = score;
74
  
75
479M
      } else if (*b == '\0') {
76
  
77
134M
  do {
78
134M
    score += w.del1;
79
134M
    if (score >= min) goto FINISH;
80
126M
    ++a;
81
126M
  } while (*a != '\0');
82
2.75M
  min = score;
83
  
84
468M
      } else {
85
86
468M
  if (score + w.max <= limit) {
87
468M
    if (limit*w.min <= w.max*(w.min+score)) {
88
      // if floor(score/max)=limit/max-1 then this edit is only good
89
      // if it makes the rest of the string match.  So check if
90
      // the rest of the string matches to avoid the overhead of
91
      // pushing it on then off the stack
92
      
93
      // delete a character from a
94
307M
      check_rest(a+1,b,score + w.del1);
95
      
96
      // delete a character from b
97
307M
      check_rest(a,b+1,score + w.del2);
98
      
99
307M
      if (*a == *(b+1) && *b == *(a+1)) {
100
101
        // swap two characters
102
1.35M
        check_rest(a+2,b+2, score + w.swap);
103
104
306M
      } else {
105
106
        // substitute one character for another which is the same
107
        // thing as deleting a character from both a & b
108
306M
        check_rest(a+1,b+1, score + w.sub);
109
      
110
306M
      }
111
    
112
307M
    } else {
113
    
114
      // delete a character from a
115
160M
      i->a = a + 1;
116
160M
      i->b = b;
117
160M
      i->score = score + w.del1;
118
160M
      ++i;
119
    
120
      // delete a character from b
121
160M
      i->a = a;
122
160M
      i->b = b + 1;
123
160M
      i->score = score + w.del2;
124
160M
      ++i;
125
      
126
      // If two characters can be swapped and make a match 
127
      // then the substitution is pointless.
128
      // Also, there is no need to push this on the stack as
129
      // it is going to be imminently removed.
130
160M
      if (*a == *(b+1) && *b == *(a+1)) {
131
        
132
        // swap two characters
133
971k
        a = a + 2;
134
971k
        b = b + 2;
135
971k
        score += w.swap;
136
971k
        continue;
137
        
138
159M
      } else {
139
        
140
        // substitute one character for another which is the same
141
        // thing as deleting a character from both a & b
142
159M
        a = a + 1;
143
159M
        b = b + 1;
144
159M
        score += w.sub;
145
159M
        continue;
146
      
147
159M
      }
148
160M
    }
149
468M
  }
150
468M
      }
151
351M
    FINISH:
152
351M
      if (i == begin) return min;
153
320M
      --i;
154
320M
      a = i->a;
155
320M
      b = i->b;
156
320M
      score = i->score;
157
320M
    }
158
30.5M
  }
159
160
#undef check_rest
161
#define check_rest(a,b,w)               \
162
454M
          a0 = a; b0 = b;               \
163
515M
          while(*a0 == *b0) {           \
164
66.4M
      if (*a0 == '\0') {          \
165
5.58M
        if (w < min) min = w;     \
166
5.58M
        break;                    \
167
5.58M
      }                           \
168
66.4M
      ++a0;                       \
169
60.8M
      ++b0;                       \
170
60.8M
          }                             \
171
454M
          if (amax < a0) amax = a0;
172
173
#define check2(a,b,w)                                             \
174
141M
  aa = a; bb = b;                                                 \
175
157M
  while(*aa == *bb) {                                             \
176
16.7M
    if (*aa == '\0')  {                                           \
177
293k
      if (amax < aa) amax = aa;                                   \
178
293k
      if (w < min) min = w;                                       \
179
293k
      break;                                                      \
180
293k
    }                                                             \
181
16.7M
    ++aa; ++bb;                                                   \
182
16.4M
  }                                                               \
183
141M
  if (*aa == '\0') {                                              \
184
5.57M
    if (amax < aa) amax = aa;                                     \
185
5.57M
    if (*bb == '\0') {}                                           \
186
5.57M
    else if (*(bb+1) == '\0' && w+ws.del2 < min) min = w+ws.del2; \
187
135M
  } else if (*bb == '\0') {                                       \
188
1.83M
    ++aa;                                                         \
189
1.83M
    if (amax < aa) amax = aa;                                     \
190
1.83M
    if (*aa == '\0' && w+ws.del1 < min) min = w+ws.del1;          \
191
134M
  } else {                                                        \
192
134M
    check_rest(aa+1,bb,w+ws.del1);                                \
193
134M
    check_rest(aa,bb+1,w+ws.del2);                                \
194
134M
    if (*aa == *(bb+1) && *bb == *(aa+1)) {                       \
195
1.50M
      check_rest(aa+2,bb+2,w+ws.swap);                            \
196
132M
    } else {                                                      \
197
132M
      check_rest(aa+1,bb+1,w+ws.sub);                             \
198
132M
    }                                                             \
199
134M
  }
200
201
  EditDist limit0_edit_distance(const char * a, const char * b,
202
        const EditDistanceWeights & ws)
203
1.62M
  {
204
1.90M
    while(*a == *b) {
205
340k
      if (*a == '\0')
206
67.5k
  return EditDist(0, a);
207
273k
      ++a; ++b;
208
273k
    }
209
210
1.56M
    return EditDist(LARGE_NUM, a);
211
1.62M
  }
212
213
  EditDist limit1_edit_distance(const char * a, const char * b,
214
        const EditDistanceWeights & ws)
215
22.5M
  {
216
22.5M
    int min = LARGE_NUM;
217
22.5M
    const char * a0;
218
22.5M
    const char * b0;
219
22.5M
    const char * amax = a;
220
    
221
32.5M
    while(*a == *b) { 
222
10.0M
      if (*a == '\0') 
223
56.2k
  return EditDist(0, a);
224
10.0M
      ++a; ++b;
225
10.0M
    }
226
227
22.5M
    if (*a == '\0') {
228
      
229
68.9k
      ++b;
230
68.9k
      if (*b == '\0') return EditDist(ws.del2, a);
231
21.3k
      return EditDist(LARGE_NUM, a);
232
      
233
22.4M
    } else if (*b == '\0') {
234
235
4.93M
      ++a;
236
4.93M
      if (*a == '\0') return EditDist(ws.del1, a);
237
3.25M
      return EditDist(LARGE_NUM, a);
238
      
239
17.5M
    } else {
240
      
241
      // delete a character from a
242
17.5M
      check_rest(a+1,b,ws.del1);
243
      
244
      // delete a character from b
245
17.5M
      check_rest(a,b+1,ws.del2);
246
247
17.5M
      if (*a == *(b+1) && *b == *(a+1)) {
248
  
249
  // swap two characters
250
239k
  check_rest(a+2,b+2,ws.swap);
251
252
17.2M
      } else {
253
  
254
  // substitute one character for another which is the same
255
  // thing as deleting a character from both a & b
256
17.2M
  check_rest(a+1,b+1,ws.sub);
257
  
258
17.2M
      }
259
17.5M
    }
260
17.5M
    return EditDist(min, amax);
261
22.5M
  }
262
263
  EditDist limit2_edit_distance(const char * a, const char * b,
264
        const EditDistanceWeights & ws)
265
50.0M
  {
266
50.0M
    int min = LARGE_NUM;
267
50.0M
    const char * a0;
268
50.0M
    const char * b0;
269
50.0M
    const char * aa;
270
50.0M
    const char * bb;
271
50.0M
    const char * amax = a;
272
    
273
60.8M
    while(*a == *b) { 
274
10.7M
      if (*a == '\0') 
275
1.24k
  return EditDist(0, a);
276
10.7M
      ++a; ++b;
277
10.7M
    }
278
279
50.0M
    if (*a == '\0') {
280
      
281
2.40M
      ++b;
282
2.40M
      if (*b == '\0') return EditDist(ws.del2, a);
283
2.23M
      ++b;
284
2.23M
      if (*b == '\0') return EditDist(2*ws.del2, a);
285
1.58M
      return EditDist(LARGE_NUM, a);
286
      
287
47.6M
    } else if (*b == '\0') {
288
289
526k
      ++a;
290
526k
      if (*a == '\0') return EditDist(ws.del1, a);
291
508k
      ++a;
292
508k
      if (*a == '\0') return EditDist(2*ws.del1, a);
293
360k
      return EditDist(LARGE_NUM, a);
294
      
295
47.1M
    } else {
296
      
297
      // delete a character from a
298
47.1M
      check2(a+1,b,ws.del1);
299
      
300
      // delete a character from b
301
47.1M
      check2(a,b+1,ws.del2);
302
303
47.1M
      if (*a == *(b+1) && *b == *(a+1)) {
304
  
305
  // swap two characters
306
413k
  check2(a+2,b+2,ws.swap);
307
308
46.7M
      } else {
309
  
310
  // substitute one character for another which is the same
311
  // thing as deleting a character from both a & b
312
46.7M
  check2(a+1,b+1,ws.sub);
313
  
314
46.7M
      }
315
47.1M
    }
316
47.1M
    return EditDist(min, amax);
317
50.0M
  }
318
}
319
320