Coverage Report

Created: 2025-10-27 06:46

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/aspell/modules/speller/default/leditdist.cpp
Line
Count
Source
1
2
#include "leditdist.hpp"
3
4
// The basic algorithm is as follows:
5
//
6
// Let A[n] represent the nth character of string n
7
//     A[n..] represent the substring of A starting at n
8
//            if n > length of A then it is considered an empty string
9
//
10
// edit_distance(A,B,limit) = ed(A,B,0)
11
//   where ed(A,B,d) = d                              if A & B is empty.
12
//                   = infinity                       if d > limit
13
//                   = ed(A[2..],B[2..], d)           if A[1] == B[1] 
14
//                   = min ( ed(A[2..],B[2..], d+1),  
15
//                           ed(A,     B[2..], d+1),
16
//                           ed(A[2..],B,      d+1) ) otherwise
17
//
18
// However, the code below:
19
// 1) Also allows for swaps
20
// 2) Allow weights to be attached to each edit
21
// 3) Is not recursive, it uses a loop when it is tail recursion
22
//    and a small stack otherwise.  The stack will NEVER be larger
23
//    then 2 * limit.
24
// 4) Is extremely optimized
25
26
27
#define check_rest(a,b,s)   \
28
454M
  a0 = a; b0 = b;           \
29
487M
  while (*a0 == *b0) {      \
30
36.3M
    if (*a0 == '\0') {      \
31
2.83M
      if (s < min) min = s; \
32
2.83M
      break;                \
33
2.83M
    }                       \
34
36.3M
    ++a0; ++b0;             \
35
33.5M
  }
36
37
namespace aspeller {
38
39
  int limit_edit_distance(const char * a, const char * b, 
40
        int limit, const EditDistanceWeights & w)
41
16.9M
  {
42
16.9M
    limit = limit*w.max;
43
16.9M
    static const int size = 10;
44
16.9M
    struct Edit {
45
16.9M
      const char * a;
46
16.9M
      const char * b;
47
16.9M
      int score;
48
16.9M
    };
49
16.9M
    Edit begin[size];
50
16.9M
    Edit * i = begin;
51
16.9M
    const char * a0;
52
16.9M
    const char * b0;
53
16.9M
    int score = 0;
54
16.9M
    int min = LARGE_NUM;
55
    
56
260M
    while (true) {
57
      
58
285M
      while (*a == *b) {
59
25.3M
  if (*a == '\0') { 
60
373k
    if (score < min) min = score;
61
373k
    goto FINISH;
62
373k
  } 
63
24.9M
  ++a; ++b;
64
24.9M
      }
65
66
260M
      if (*a == '\0') {
67
68
121M
  do {
69
121M
    score += w.del2;
70
121M
    if (score >= min) goto FINISH;
71
108M
    ++b;
72
108M
  } while (*b != '\0');
73
7.10M
  min = score;
74
  
75
239M
      } else if (*b == '\0') {
76
  
77
55.5M
  do {
78
55.5M
    score += w.del1;
79
55.5M
    if (score >= min) goto FINISH;
80
50.3M
    ++a;
81
50.3M
  } while (*a != '\0');
82
1.65M
  min = score;
83
  
84
232M
      } else {
85
86
232M
  if (score + w.max <= limit) {
87
232M
    if (limit*w.min <= w.max*(w.min+score)) {
88
      // if floor(score/max)=limit/max-1 then this edit is only good
89
      // if it makes the rest of the string match.  So check if
90
      // the rest of the string matches to avoid the overhead of
91
      // pushing it on then off the stack
92
      
93
      // delete a character from a
94
151M
      check_rest(a+1,b,score + w.del1);
95
      
96
      // delete a character from b
97
151M
      check_rest(a,b+1,score + w.del2);
98
      
99
151M
      if (*a == *(b+1) && *b == *(a+1)) {
100
101
        // swap two characters
102
917k
        check_rest(a+2,b+2, score + w.swap);
103
104
150M
      } else {
105
106
        // substitute one character for another which is the same
107
        // thing as deleting a character from both a & b
108
150M
        check_rest(a+1,b+1, score + w.sub);
109
      
110
150M
      }
111
    
112
151M
    } else {
113
    
114
      // delete a character from a
115
81.3M
      i->a = a + 1;
116
81.3M
      i->b = b;
117
81.3M
      i->score = score + w.del1;
118
81.3M
      ++i;
119
    
120
      // delete a character from b
121
81.3M
      i->a = a;
122
81.3M
      i->b = b + 1;
123
81.3M
      i->score = score + w.del2;
124
81.3M
      ++i;
125
      
126
      // If two characters can be swapped and make a match 
127
      // then the substitution is pointless.
128
      // Also, there is no need to push this on the stack as
129
      // it is going to be imminently removed.
130
81.3M
      if (*a == *(b+1) && *b == *(a+1)) {
131
        
132
        // swap two characters
133
724k
        a = a + 2;
134
724k
        b = b + 2;
135
724k
        score += w.swap;
136
724k
        continue;
137
        
138
80.6M
      } else {
139
        
140
        // substitute one character for another which is the same
141
        // thing as deleting a character from both a & b
142
80.6M
        a = a + 1;
143
80.6M
        b = b + 1;
144
80.6M
        score += w.sub;
145
80.6M
        continue;
146
      
147
80.6M
      }
148
81.3M
    }
149
232M
  }
150
232M
      }
151
179M
    FINISH:
152
179M
      if (i == begin) return min;
153
162M
      --i;
154
162M
      a = i->a;
155
162M
      b = i->b;
156
162M
      score = i->score;
157
162M
    }
158
16.9M
  }
159
160
#undef check_rest
161
#define check_rest(a,b,w)               \
162
396M
          a0 = a; b0 = b;               \
163
452M
          while(*a0 == *b0) {           \
164
62.1M
      if (*a0 == '\0') {          \
165
6.29M
        if (w < min) min = w;     \
166
6.29M
        break;                    \
167
6.29M
      }                           \
168
62.1M
      ++a0;                       \
169
55.8M
      ++b0;                       \
170
55.8M
          }                             \
171
396M
          if (amax < a0) amax = a0;
172
173
#define check2(a,b,w)                                             \
174
120M
  aa = a; bb = b;                                                 \
175
135M
  while(*aa == *bb) {                                             \
176
15.2M
    if (*aa == '\0')  {                                           \
177
356k
      if (amax < aa) amax = aa;                                   \
178
356k
      if (w < min) min = w;                                       \
179
356k
      break;                                                      \
180
356k
    }                                                             \
181
15.2M
    ++aa; ++bb;                                                   \
182
14.8M
  }                                                               \
183
120M
  if (*aa == '\0') {                                              \
184
6.56M
    if (amax < aa) amax = aa;                                     \
185
6.56M
    if (*bb == '\0') {}                                           \
186
6.56M
    else if (*(bb+1) == '\0' && w+ws.del2 < min) min = w+ws.del2; \
187
113M
  } else if (*bb == '\0') {                                       \
188
1.71M
    ++aa;                                                         \
189
1.71M
    if (amax < aa) amax = aa;                                     \
190
1.71M
    if (*aa == '\0' && w+ws.del1 < min) min = w+ws.del1;          \
191
111M
  } else {                                                        \
192
111M
    check_rest(aa+1,bb,w+ws.del1);                                \
193
111M
    check_rest(aa,bb+1,w+ws.del2);                                \
194
111M
    if (*aa == *(bb+1) && *bb == *(aa+1)) {                       \
195
1.07M
      check_rest(aa+2,bb+2,w+ws.swap);                            \
196
110M
    } else {                                                      \
197
110M
      check_rest(aa+1,bb+1,w+ws.sub);                             \
198
110M
    }                                                             \
199
111M
  }
200
201
  EditDist limit0_edit_distance(const char * a, const char * b,
202
        const EditDistanceWeights & ws)
203
1.18M
  {
204
1.40M
    while(*a == *b) {
205
256k
      if (*a == '\0')
206
42.9k
  return EditDist(0, a);
207
213k
      ++a; ++b;
208
213k
    }
209
210
1.14M
    return EditDist(LARGE_NUM, a);
211
1.18M
  }
212
213
  EditDist limit1_edit_distance(const char * a, const char * b,
214
        const EditDistanceWeights & ws)
215
25.2M
  {
216
25.2M
    int min = LARGE_NUM;
217
25.2M
    const char * a0;
218
25.2M
    const char * b0;
219
25.2M
    const char * amax = a;
220
    
221
38.0M
    while(*a == *b) { 
222
12.8M
      if (*a == '\0') 
223
72.0k
  return EditDist(0, a);
224
12.7M
      ++a; ++b;
225
12.7M
    }
226
227
25.1M
    if (*a == '\0') {
228
      
229
79.9k
      ++b;
230
79.9k
      if (*b == '\0') return EditDist(ws.del2, a);
231
27.1k
      return EditDist(LARGE_NUM, a);
232
      
233
25.1M
    } else if (*b == '\0') {
234
235
4.82M
      ++a;
236
4.82M
      if (*a == '\0') return EditDist(ws.del1, a);
237
3.44M
      return EditDist(LARGE_NUM, a);
238
      
239
20.2M
    } else {
240
      
241
      // delete a character from a
242
20.2M
      check_rest(a+1,b,ws.del1);
243
      
244
      // delete a character from b
245
20.2M
      check_rest(a,b+1,ws.del2);
246
247
20.2M
      if (*a == *(b+1) && *b == *(a+1)) {
248
  
249
  // swap two characters
250
258k
  check_rest(a+2,b+2,ws.swap);
251
252
20.0M
      } else {
253
  
254
  // substitute one character for another which is the same
255
  // thing as deleting a character from both a & b
256
20.0M
  check_rest(a+1,b+1,ws.sub);
257
  
258
20.0M
      }
259
20.2M
    }
260
20.2M
    return EditDist(min, amax);
261
25.1M
  }
262
263
  EditDist limit2_edit_distance(const char * a, const char * b,
264
        const EditDistanceWeights & ws)
265
41.6M
  {
266
41.6M
    int min = LARGE_NUM;
267
41.6M
    const char * a0;
268
41.6M
    const char * b0;
269
41.6M
    const char * aa;
270
41.6M
    const char * bb;
271
41.6M
    const char * amax = a;
272
    
273
51.1M
    while(*a == *b) { 
274
9.43M
      if (*a == '\0') 
275
441
  return EditDist(0, a);
276
9.43M
      ++a; ++b;
277
9.43M
    }
278
279
41.6M
    if (*a == '\0') {
280
      
281
1.36M
      ++b;
282
1.36M
      if (*b == '\0') return EditDist(ws.del2, a);
283
1.26M
      ++b;
284
1.26M
      if (*b == '\0') return EditDist(2*ws.del2, a);
285
902k
      return EditDist(LARGE_NUM, a);
286
      
287
40.3M
    } else if (*b == '\0') {
288
289
235k
      ++a;
290
235k
      if (*a == '\0') return EditDist(ws.del1, a);
291
224k
      ++a;
292
224k
      if (*a == '\0') return EditDist(2*ws.del1, a);
293
159k
      return EditDist(LARGE_NUM, a);
294
      
295
40.0M
    } else {
296
      
297
      // delete a character from a
298
40.0M
      check2(a+1,b,ws.del1);
299
      
300
      // delete a character from b
301
40.0M
      check2(a,b+1,ws.del2);
302
303
40.0M
      if (*a == *(b+1) && *b == *(a+1)) {
304
  
305
  // swap two characters
306
229k
  check2(a+2,b+2,ws.swap);
307
308
39.8M
      } else {
309
  
310
  // substitute one character for another which is the same
311
  // thing as deleting a character from both a & b
312
39.8M
  check2(a+1,b+1,ws.sub);
313
  
314
39.8M
      }
315
40.0M
    }
316
40.0M
    return EditDist(min, amax);
317
41.6M
  }
318
}
319
320