Coverage Report

Created: 2023-12-08 06:59

/src/aspell/modules/speller/default/leditdist.cpp
Line
Count
Source
1
2
#include "leditdist.hpp"
3
4
// The basic algorithm is as follows:
5
//
6
// Let A[n] represent the nth character of string n
7
//     A[n..] represent the substring of A starting at n
8
//            if n > length of A then it is considered an empty string
9
//
10
// edit_distance(A,B,limit) = ed(A,B,0)
11
//   where ed(A,B,d) = d                              if A & B is empty.
12
//                   = infinity                       if d > limit
13
//                   = ed(A[2..],B[2..], d)           if A[1] == B[1] 
14
//                   = min ( ed(A[2..],B[2..], d+1),  
15
//                           ed(A,     B[2..], d+1),
16
//                           ed(A[2..],B,      d+1) ) otherwise
17
//
18
// However, the code below:
19
// 1) Also allows for swaps
20
// 2) Allow weights to be attached to each edit
21
// 3) Is not recursive, it uses a loop when it is tail recursion
22
//    and a small stack otherwise.  The stack will NEVER be larger
23
//    then 2 * limit.
24
// 4) Is extremely optimized
25
26
27
#define check_rest(a,b,s)   \
28
201M
  a0 = a; b0 = b;           \
29
212M
  while (*a0 == *b0) {      \
30
11.8M
    if (*a0 == '\0') {      \
31
980k
      if (s < min) min = s; \
32
980k
      break;                \
33
980k
    }                       \
34
11.8M
    ++a0; ++b0;             \
35
10.8M
  }
36
37
namespace aspeller {
38
39
  int limit_edit_distance(const char * a, const char * b, 
40
        int limit, const EditDistanceWeights & w)
41
7.39M
  {
42
7.39M
    limit = limit*w.max;
43
7.39M
    static const int size = 10;
44
7.39M
    struct Edit {
45
7.39M
      const char * a;
46
7.39M
      const char * b;
47
7.39M
      int score;
48
7.39M
    };
49
7.39M
    Edit begin[size];
50
7.39M
    Edit * i = begin;
51
7.39M
    const char * a0;
52
7.39M
    const char * b0;
53
7.39M
    int score = 0;
54
7.39M
    int min = LARGE_NUM;
55
    
56
109M
    while (true) {
57
      
58
116M
      while (*a == *b) {
59
7.89M
  if (*a == '\0') { 
60
85.3k
    if (score < min) min = score;
61
85.3k
    goto FINISH;
62
85.3k
  } 
63
7.80M
  ++a; ++b;
64
7.80M
      }
65
66
109M
      if (*a == '\0') {
67
68
25.1M
  do {
69
25.1M
    score += w.del2;
70
25.1M
    if (score >= min) goto FINISH;
71
22.9M
    ++b;
72
22.9M
  } while (*b != '\0');
73
2.97M
  min = score;
74
  
75
103M
      } else if (*b == '\0') {
76
  
77
25.3M
  do {
78
25.3M
    score += w.del1;
79
25.3M
    if (score >= min) goto FINISH;
80
23.1M
    ++a;
81
23.1M
  } while (*a != '\0');
82
551k
  min = score;
83
  
84
101M
      } else {
85
86
101M
  if (score + w.max <= limit) {
87
101M
    if (limit*w.min <= w.max*(w.min+score)) {
88
      // if floor(score/max)=limit/max-1 then this edit is only good
89
      // if it makes the rest of the string match.  So check if
90
      // the rest of the string matches to avoid the overhead of
91
      // pushing it on then off the stack
92
      
93
      // delete a character from a
94
67.2M
      check_rest(a+1,b,score + w.del1);
95
      
96
      // delete a character from b
97
67.2M
      check_rest(a,b+1,score + w.del2);
98
      
99
67.2M
      if (*a == *(b+1) && *b == *(a+1)) {
100
101
        // swap two characters
102
261k
        check_rest(a+2,b+2, score + w.swap);
103
104
66.9M
      } else {
105
106
        // substitute one character for another which is the same
107
        // thing as deleting a character from both a & b
108
66.9M
        check_rest(a+1,b+1, score + w.sub);
109
      
110
66.9M
      }
111
    
112
67.2M
    } else {
113
    
114
      // delete a character from a
115
33.9M
      i->a = a + 1;
116
33.9M
      i->b = b;
117
33.9M
      i->score = score + w.del1;
118
33.9M
      ++i;
119
    
120
      // delete a character from b
121
33.9M
      i->a = a;
122
33.9M
      i->b = b + 1;
123
33.9M
      i->score = score + w.del2;
124
33.9M
      ++i;
125
      
126
      // If two characters can be swapped and make a match 
127
      // then the substitution is pointless.
128
      // Also, there is no need to push this on the stack as
129
      // it is going to be imminently removed.
130
33.9M
      if (*a == *(b+1) && *b == *(a+1)) {
131
        
132
        // swap two characters
133
223k
        a = a + 2;
134
223k
        b = b + 2;
135
223k
        score += w.swap;
136
223k
        continue;
137
        
138
33.6M
      } else {
139
        
140
        // substitute one character for another which is the same
141
        // thing as deleting a character from both a & b
142
33.6M
        a = a + 1;
143
33.6M
        b = b + 1;
144
33.6M
        score += w.sub;
145
33.6M
        continue;
146
      
147
33.6M
      }
148
33.9M
    }
149
101M
  }
150
101M
      }
151
75.2M
    FINISH:
152
75.2M
      if (i == begin) return min;
153
67.8M
      --i;
154
67.8M
      a = i->a;
155
67.8M
      b = i->b;
156
67.8M
      score = i->score;
157
67.8M
    }
158
7.39M
  }
159
160
#undef check_rest
161
#define check_rest(a,b,w)               \
162
268M
          a0 = a; b0 = b;               \
163
308M
          while(*a0 == *b0) {           \
164
42.1M
      if (*a0 == '\0') {          \
165
2.48M
        if (w < min) min = w;     \
166
2.48M
        break;                    \
167
2.48M
      }                           \
168
42.1M
      ++a0;                       \
169
39.7M
      ++b0;                       \
170
39.7M
          }                             \
171
268M
          if (amax < a0) amax = a0;
172
173
#define check2(a,b,w)                                             \
174
84.5M
  aa = a; bb = b;                                                 \
175
94.9M
  while(*aa == *bb) {                                             \
176
10.5M
    if (*aa == '\0')  {                                           \
177
128k
      if (amax < aa) amax = aa;                                   \
178
128k
      if (w < min) min = w;                                       \
179
128k
      break;                                                      \
180
128k
    }                                                             \
181
10.5M
    ++aa; ++bb;                                                   \
182
10.3M
  }                                                               \
183
84.5M
  if (*aa == '\0') {                                              \
184
3.29M
    if (amax < aa) amax = aa;                                     \
185
3.29M
    if (*bb == '\0') {}                                           \
186
3.29M
    else if (*(bb+1) == '\0' && w+ws.del2 < min) min = w+ws.del2; \
187
81.2M
  } else if (*bb == '\0') {                                       \
188
689k
    ++aa;                                                         \
189
689k
    if (amax < aa) amax = aa;                                     \
190
689k
    if (*aa == '\0' && w+ws.del1 < min) min = w+ws.del1;          \
191
80.5M
  } else {                                                        \
192
80.5M
    check_rest(aa+1,bb,w+ws.del1);                                \
193
80.5M
    check_rest(aa,bb+1,w+ws.del2);                                \
194
80.5M
    if (*aa == *(bb+1) && *bb == *(aa+1)) {                       \
195
579k
      check_rest(aa+2,bb+2,w+ws.swap);                            \
196
79.9M
    } else {                                                      \
197
79.9M
      check_rest(aa+1,bb+1,w+ws.sub);                             \
198
79.9M
    }                                                             \
199
80.5M
  }
200
201
  EditDist limit0_edit_distance(const char * a, const char * b,
202
        const EditDistanceWeights & ws)
203
264k
  {
204
337k
    while(*a == *b) {
205
77.4k
      if (*a == '\0')
206
4.29k
  return EditDist(0, a);
207
73.1k
      ++a; ++b;
208
73.1k
    }
209
210
260k
    return EditDist(LARGE_NUM, a);
211
264k
  }
212
213
  EditDist limit1_edit_distance(const char * a, const char * b,
214
        const EditDistanceWeights & ws)
215
11.1M
  {
216
11.1M
    int min = LARGE_NUM;
217
11.1M
    const char * a0;
218
11.1M
    const char * b0;
219
11.1M
    const char * amax = a;
220
    
221
15.4M
    while(*a == *b) { 
222
4.32M
      if (*a == '\0') 
223
20.3k
  return EditDist(0, a);
224
4.30M
      ++a; ++b;
225
4.30M
    }
226
227
11.0M
    if (*a == '\0') {
228
      
229
33.6k
      ++b;
230
33.6k
      if (*b == '\0') return EditDist(ws.del2, a);
231
21.2k
      return EditDist(LARGE_NUM, a);
232
      
233
11.0M
    } else if (*b == '\0') {
234
235
2.10M
      ++a;
236
2.10M
      if (*a == '\0') return EditDist(ws.del1, a);
237
1.43M
      return EditDist(LARGE_NUM, a);
238
      
239
8.95M
    } else {
240
      
241
      // delete a character from a
242
8.95M
      check_rest(a+1,b,ws.del1);
243
      
244
      // delete a character from b
245
8.95M
      check_rest(a,b+1,ws.del2);
246
247
8.95M
      if (*a == *(b+1) && *b == *(a+1)) {
248
  
249
  // swap two characters
250
114k
  check_rest(a+2,b+2,ws.swap);
251
252
8.83M
      } else {
253
  
254
  // substitute one character for another which is the same
255
  // thing as deleting a character from both a & b
256
8.83M
  check_rest(a+1,b+1,ws.sub);
257
  
258
8.83M
      }
259
8.95M
    }
260
8.95M
    return EditDist(min, amax);
261
11.0M
  }
262
263
  EditDist limit2_edit_distance(const char * a, const char * b,
264
        const EditDistanceWeights & ws)
265
29.9M
  {
266
29.9M
    int min = LARGE_NUM;
267
29.9M
    const char * a0;
268
29.9M
    const char * b0;
269
29.9M
    const char * aa;
270
29.9M
    const char * bb;
271
29.9M
    const char * amax = a;
272
    
273
36.3M
    while(*a == *b) { 
274
6.37M
      if (*a == '\0') 
275
143
  return EditDist(0, a);
276
6.37M
      ++a; ++b;
277
6.37M
    }
278
279
29.9M
    if (*a == '\0') {
280
      
281
1.45M
      ++b;
282
1.45M
      if (*b == '\0') return EditDist(ws.del2, a);
283
1.34M
      ++b;
284
1.34M
      if (*b == '\0') return EditDist(2*ws.del2, a);
285
964k
      return EditDist(LARGE_NUM, a);
286
      
287
28.5M
    } else if (*b == '\0') {
288
289
358k
      ++a;
290
358k
      if (*a == '\0') return EditDist(ws.del1, a);
291
346k
      ++a;
292
346k
      if (*a == '\0') return EditDist(2*ws.del1, a);
293
213k
      return EditDist(LARGE_NUM, a);
294
      
295
28.1M
    } else {
296
      
297
      // delete a character from a
298
28.1M
      check2(a+1,b,ws.del1);
299
      
300
      // delete a character from b
301
28.1M
      check2(a,b+1,ws.del2);
302
303
28.1M
      if (*a == *(b+1) && *b == *(a+1)) {
304
  
305
  // swap two characters
306
144k
  check2(a+2,b+2,ws.swap);
307
308
28.0M
      } else {
309
  
310
  // substitute one character for another which is the same
311
  // thing as deleting a character from both a & b
312
28.0M
  check2(a+1,b+1,ws.sub);
313
  
314
28.0M
      }
315
28.1M
    }
316
28.1M
    return EditDist(min, amax);
317
29.9M
  }
318
}
319
320