Coverage Report

Created: 2026-03-12 07:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/gettext/gettext-tools/libgettextpo/diffseq.h
Line
Count
Source
1
/* Analyze differences between two vectors.
2
3
   Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2026 Free Software
4
   Foundation, Inc.
5
6
   This program is free software: you can redistribute it and/or modify
7
   it under the terms of the GNU General Public License as published by
8
   the Free Software Foundation, either version 3 of the License, or
9
   (at your option) any later version.
10
11
   This program is distributed in the hope that it will be useful,
12
   but WITHOUT ANY WARRANTY; without even the implied warranty of
13
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
   GNU General Public License for more details.
15
16
   You should have received a copy of the GNU General Public License
17
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
18
19
20
/* The basic idea is to consider two vectors as similar if, when
21
   transforming the first vector into the second vector through a
22
   sequence of edits (inserts and deletes of one element each),
23
   this sequence is short - or equivalently, if the ordered list
24
   of elements that are untouched by these edits is long.  For a
25
   good introduction to the subject, read about the "Levenshtein
26
   distance" in Wikipedia.
27
28
   The basic algorithm is described in:
29
   "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
30
   Algorithmica Vol. 1, 1986, pp. 251-266,
31
   <https://doi.org/10.1007/BF01840446>.
32
   See especially section 4.2, which describes the variation used below.
33
34
   The basic algorithm was independently discovered as described in:
35
   "Algorithms for Approximate String Matching", Esko Ukkonen,
36
   Information and Control Vol. 64, 1985, pp. 100-118,
37
   <https://doi.org/10.1016/S0019-9958(85)80046-2>.
38
39
   Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
40
   heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
41
   at the price of producing suboptimal output for large inputs with
42
   many differences.  */
43
44
/* Before including this file, you need to define:
45
     ELEMENT                 The element type of the vectors being compared.
46
     EQUAL                   A two-argument macro that tests two elements for
47
                             equality.
48
     OFFSET                  A signed integer type sufficient to hold the
49
                             difference between two indices.  Usually
50
                             something like ptrdiff_t.
51
     OFFSET_MAX              (Optional) The maximum value of OFFSET (e.g.,
52
                             PTRDIFF_MAX).  If omitted, it is inferred in a
53
                             way portable to the vast majority of C platforms,
54
                             as they lack padding bits.
55
     EXTRA_CONTEXT_FIELDS    Declarations of fields for 'struct context'.
56
     NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
57
     NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
58
     NOTE_ORDERED            (Optional) A boolean expression saying that
59
                             NOTE_DELETE and NOTE_INSERT calls must be
60
                             issued in offset order.
61
     EARLY_ABORT(ctxt)       (Optional) A boolean expression that triggers an
62
                             early abort of the computation.
63
     USE_HEURISTIC           (Optional) Define if you want to support the
64
                             heuristic for large vectors.
65
66
   It is also possible to use this file with abstract arrays.  In this case,
67
   xvec and yvec are not represented in memory.  They only exist conceptually.
68
   In this case, the list of defines above is amended as follows:
69
     ELEMENT                 Undefined.
70
     EQUAL                   Undefined.
71
     XVECREF_YVECREF_EQUAL(ctxt, xoff, yoff)
72
                             A three-argument macro: References xvec[xoff] and
73
                             yvec[yoff] and tests these elements for equality.
74
75
   Before including this file, you also need to include:
76
     #include <limits.h>
77
     #include "minmax.h"
78
 */
79
80
/* This file uses _GL_GNUC_PREREQ.  */
81
#if !_GL_CONFIG_H_INCLUDED
82
 #error "Please include config.h first."
83
#endif
84
85
/* Maximum value of type OFFSET.  */
86
#ifndef OFFSET_MAX
87
# define OFFSET_MAX \
88
   ((((OFFSET) 1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
89
#endif
90
91
/* Default to no early abort.  */
92
#ifndef EARLY_ABORT
93
# define EARLY_ABORT(ctxt) false
94
#endif
95
96
#ifndef NOTE_ORDERED
97
# define NOTE_ORDERED false
98
#endif
99
100
/* Suppress gcc's "...may be used before initialized" warnings,
101
   generated by GCC versions up to at least GCC 15.1.
102
   Likewise for gcc -fanalyzer's "use of uninitialized value" warnings.  */
103
#if _GL_GNUC_PREREQ (4, 7)
104
# pragma GCC diagnostic push
105
# pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
106
# if _GL_GNUC_PREREQ (13, 0)
107
#  pragma GCC diagnostic ignored "-Wanalyzer-use-of-uninitialized-value"
108
# endif
109
#endif
110
111
/*
112
 * Context of comparison operation.
113
 */
114
struct context
115
{
116
  #ifdef ELEMENT
117
  /* Vectors being compared.  */
118
  ELEMENT const *xvec;
119
  ELEMENT const *yvec;
120
  #endif
121
122
  /* Extra fields.  */
123
  EXTRA_CONTEXT_FIELDS
124
125
  /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point
126
     furthest along the given diagonal in the forward search of the edit
127
     matrix.  */
128
  OFFSET *fdiag;
129
130
  /* Vector, indexed by diagonal, containing the X coordinate of the point
131
     furthest along the given diagonal in the backward search of the edit
132
     matrix.  */
133
  OFFSET *bdiag;
134
135
  #ifdef USE_HEURISTIC
136
  /* This corresponds to the diff --speed-large-files flag.  With this
137
     heuristic, for vectors with a constant small density of changes,
138
     the algorithm is linear in the vector size.  */
139
  bool heuristic;
140
  #endif
141
142
  /* Edit scripts longer than this are too expensive to compute.  */
143
  OFFSET too_expensive;
144
145
  /* Snakes bigger than this are considered "big".  */
146
0
  #define SNAKE_LIMIT 20
147
};
148
149
struct partition
150
{
151
  /* Midpoints of this partition.  */
152
  OFFSET xmid;
153
  OFFSET ymid;
154
155
  /* True if low half will be analyzed minimally.  */
156
  bool lo_minimal;
157
158
  /* Likewise for high half.  */
159
  bool hi_minimal;
160
};
161
162
163
/* Find the midpoint of the shortest edit script for a specified portion
164
   of the two vectors.
165
166
   Scan from the beginnings of the vectors, and simultaneously from the ends,
167
   doing a breadth-first search through the space of edit-sequence.
168
   When the two searches meet, we have found the midpoint of the shortest
169
   edit sequence.
170
171
   If FIND_MINIMAL is true, find the minimal edit script regardless of
172
   expense.  Otherwise, if the search is too expensive, use heuristics to
173
   stop the search and report a suboptimal answer.
174
175
   Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
176
   XMID - YMID equals the number of inserted elements minus the number
177
   of deleted elements (counting only elements before the midpoint).
178
179
   Set PART->lo_minimal to true iff the minimal edit script for the
180
   left half of the partition is known; similarly for PART->hi_minimal.
181
182
   This function assumes that the first elements of the specified portions
183
   of the two vectors do not match, and likewise that the last elements do not
184
   match.  The caller must trim matching elements from the beginning and end
185
   of the portions it is going to specify.
186
187
   If we return the "wrong" partitions, the worst this can do is cause
188
   suboptimal diff output.  It cannot cause incorrect diff output.  */
189
190
static void
191
diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
192
      struct partition *part, struct context *ctxt)
193
0
{
194
0
  OFFSET *const fd = ctxt->fdiag;       /* Give the compiler a chance. */
195
0
  OFFSET *const bd = ctxt->bdiag;       /* Additional help for the compiler. */
196
0
#ifdef ELEMENT
197
0
  ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */
198
0
  ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */
199
0
  #define XREF_YREF_EQUAL(x,y)  EQUAL (xv[x], yv[y])
200
#else
201
  #define XREF_YREF_EQUAL(x,y)  XVECREF_YVECREF_EQUAL (ctxt, x, y)
202
#endif
203
0
  const OFFSET dmin = xoff - ylim;      /* Minimum valid diagonal. */
204
0
  const OFFSET dmax = xlim - yoff;      /* Maximum valid diagonal. */
205
0
  const OFFSET fmid = xoff - yoff;      /* Center diagonal of top-down search. */
206
0
  const OFFSET bmid = xlim - ylim;      /* Center diagonal of bottom-up search. */
207
0
  OFFSET fmin = fmid;
208
0
  OFFSET fmax = fmid;           /* Limits of top-down search. */
209
0
  OFFSET bmin = bmid;
210
0
  OFFSET bmax = bmid;           /* Limits of bottom-up search. */
211
0
  OFFSET c;                     /* Cost. */
212
0
  bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
213
                                   diagonal with respect to the northwest. */
214
215
0
  fd[fmid] = xoff;
216
0
  bd[bmid] = xlim;
217
218
0
  for (c = 1;; ++c)
219
0
    {
220
0
      OFFSET d;                 /* Active diagonal. */
221
0
      bool big_snake = false;
222
223
      /* Extend the top-down search by an edit step in each diagonal. */
224
0
      if (fmin > dmin)
225
0
        fd[--fmin - 1] = -1;
226
0
      else
227
0
        ++fmin;
228
0
      if (fmax < dmax)
229
0
        fd[++fmax + 1] = -1;
230
0
      else
231
0
        --fmax;
232
0
      for (d = fmax; d >= fmin; d -= 2)
233
0
        {
234
0
          OFFSET x;
235
0
          OFFSET y;
236
0
          OFFSET tlo = fd[d - 1];
237
0
          OFFSET thi = fd[d + 1];
238
0
          OFFSET x0 = tlo < thi ? thi : tlo + 1;
239
240
0
          for (x = x0, y = x0 - d;
241
0
               x < xlim && y < ylim && XREF_YREF_EQUAL (x, y);
242
0
               x++, y++)
243
0
            continue;
244
0
          if (x - x0 > SNAKE_LIMIT)
245
0
            big_snake = true;
246
0
          fd[d] = x;
247
0
          if (odd && bmin <= d && d <= bmax && bd[d] <= x)
248
0
            {
249
0
              part->xmid = x;
250
0
              part->ymid = y;
251
0
              part->lo_minimal = part->hi_minimal = true;
252
0
              return;
253
0
            }
254
0
        }
255
256
      /* Similarly extend the bottom-up search.  */
257
0
      if (bmin > dmin)
258
0
        bd[--bmin - 1] = OFFSET_MAX;
259
0
      else
260
0
        ++bmin;
261
0
      if (bmax < dmax)
262
0
        bd[++bmax + 1] = OFFSET_MAX;
263
0
      else
264
0
        --bmax;
265
0
      for (d = bmax; d >= bmin; d -= 2)
266
0
        {
267
0
          OFFSET x;
268
0
          OFFSET y;
269
0
          OFFSET tlo = bd[d - 1];
270
0
          OFFSET thi = bd[d + 1];
271
0
          OFFSET x0 = tlo < thi ? tlo : thi - 1;
272
273
0
          for (x = x0, y = x0 - d;
274
0
               xoff < x && yoff < y && XREF_YREF_EQUAL (x - 1, y - 1);
275
0
               x--, y--)
276
0
            continue;
277
0
          if (x0 - x > SNAKE_LIMIT)
278
0
            big_snake = true;
279
0
          bd[d] = x;
280
0
          if (!odd && fmin <= d && d <= fmax && x <= fd[d])
281
0
            {
282
0
              part->xmid = x;
283
0
              part->ymid = y;
284
0
              part->lo_minimal = part->hi_minimal = true;
285
0
              return;
286
0
            }
287
0
        }
288
289
0
      if (!find_minimal)
290
0
        {
291
#ifdef USE_HEURISTIC
292
          bool heuristic = ctxt->heuristic;
293
#else
294
0
          bool heuristic = false;
295
0
#endif
296
297
          /* Heuristic: check occasionally for a diagonal that has made lots
298
             of progress compared with the edit distance.  If we have any
299
             such, find the one that has made the most progress and return it
300
             as if it had succeeded.
301
302
             With this heuristic, for vectors with a constant small density
303
             of changes, the algorithm is linear in the vector size.  */
304
305
0
          if (200 < c && big_snake && heuristic)
306
0
            {
307
0
              {
308
0
                OFFSET best = 0;
309
310
0
                for (d = fmax; d >= fmin; d -= 2)
311
0
                  {
312
0
                    OFFSET dd = d - fmid;
313
0
                    OFFSET x = fd[d];
314
0
                    OFFSET y = x - d;
315
0
                    OFFSET v = (x - xoff) * 2 - dd;
316
317
0
                    if (v > 12 * (c + (dd < 0 ? -dd : dd)))
318
0
                      {
319
0
                        if (v > best
320
0
                            && xoff + SNAKE_LIMIT <= x && x < xlim
321
0
                            && yoff + SNAKE_LIMIT <= y && y < ylim)
322
0
                          {
323
                            /* We have a good enough best diagonal; now insist
324
                               that it end with a significant snake.  */
325
0
                            for (int k = 1; XREF_YREF_EQUAL (x - k, y - k); k++)
326
0
                              if (k == SNAKE_LIMIT)
327
0
                                {
328
0
                                  best = v;
329
0
                                  part->xmid = x;
330
0
                                  part->ymid = y;
331
0
                                  break;
332
0
                                }
333
0
                          }
334
0
                      }
335
0
                  }
336
0
                if (best > 0)
337
0
                  {
338
0
                    part->lo_minimal = true;
339
0
                    part->hi_minimal = false;
340
0
                    return;
341
0
                  }
342
0
              }
343
344
0
              {
345
0
                OFFSET best = 0;
346
347
0
                for (d = bmax; d >= bmin; d -= 2)
348
0
                  {
349
0
                    OFFSET dd = d - bmid;
350
0
                    OFFSET x = bd[d];
351
0
                    OFFSET y = x - d;
352
0
                    OFFSET v = (xlim - x) * 2 + dd;
353
354
0
                    if (v > 12 * (c + (dd < 0 ? -dd : dd)))
355
0
                      {
356
0
                        if (v > best
357
0
                            && xoff < x && x <= xlim - SNAKE_LIMIT
358
0
                            && yoff < y && y <= ylim - SNAKE_LIMIT)
359
0
                          {
360
                            /* We have a good enough best diagonal; now insist
361
                               that it end with a significant snake.  */
362
0
                            for (int k = 0; XREF_YREF_EQUAL (x + k, y + k); k++)
363
0
                              if (k == SNAKE_LIMIT - 1)
364
0
                                {
365
0
                                  best = v;
366
0
                                  part->xmid = x;
367
0
                                  part->ymid = y;
368
0
                                  break;
369
0
                                }
370
0
                          }
371
0
                      }
372
0
                  }
373
0
                if (best > 0)
374
0
                  {
375
0
                    part->lo_minimal = false;
376
0
                    part->hi_minimal = true;
377
0
                    return;
378
0
                  }
379
0
              }
380
0
            }
381
382
          /* Heuristic: if we've gone well beyond the call of duty, give up
383
             and report halfway between our best results so far.  */
384
0
          if (c >= ctxt->too_expensive)
385
0
            {
386
              /* Find forward diagonal that maximizes X + Y.  */
387
0
              OFFSET fxybest = -1, fxbest;
388
0
              for (d = fmax; d >= fmin; d -= 2)
389
0
                {
390
0
                  OFFSET x = MIN (fd[d], xlim);
391
0
                  OFFSET y = x - d;
392
0
                  if (ylim < y)
393
0
                    {
394
0
                      x = ylim + d;
395
0
                      y = ylim;
396
0
                    }
397
0
                  if (fxybest < x + y)
398
0
                    {
399
0
                      fxybest = x + y;
400
0
                      fxbest = x;
401
0
                    }
402
0
                }
403
404
              /* Find backward diagonal that minimizes X + Y.  */
405
0
              OFFSET bxybest = OFFSET_MAX, bxbest;
406
0
              for (d = bmax; d >= bmin; d -= 2)
407
0
                {
408
0
                  OFFSET x = MAX (xoff, bd[d]);
409
0
                  OFFSET y = x - d;
410
0
                  if (y < yoff)
411
0
                    {
412
0
                      x = yoff + d;
413
0
                      y = yoff;
414
0
                    }
415
0
                  if (x + y < bxybest)
416
0
                    {
417
0
                      bxybest = x + y;
418
0
                      bxbest = x;
419
0
                    }
420
0
                }
421
422
              /* Use the better of the two diagonals.  */
423
0
              if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
424
0
                {
425
0
                  part->xmid = fxbest;
426
0
                  part->ymid = fxybest - fxbest;
427
0
                  part->lo_minimal = true;
428
0
                  part->hi_minimal = false;
429
0
                }
430
0
              else
431
0
                {
432
0
                  part->xmid = bxbest;
433
0
                  part->ymid = bxybest - bxbest;
434
0
                  part->lo_minimal = false;
435
0
                  part->hi_minimal = true;
436
0
                }
437
0
              return;
438
0
            }
439
0
        }
440
0
    }
441
0
  #undef XREF_YREF_EQUAL
442
0
}
443
444
445
/* Compare in detail contiguous subsequences of the two vectors
446
   which are known, as a whole, to match each other.
447
448
   The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
449
450
   Note that XLIM, YLIM are exclusive bounds.  All indices into the vectors
451
   are origin-0.
452
453
   If FIND_MINIMAL, find a minimal difference no matter how
454
   expensive it is.
455
456
   The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
457
458
   Return false if terminated normally, or true if terminated through early
459
   abort.  */
460
461
static bool
462
compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
463
            bool find_minimal, struct context *ctxt)
464
0
{
465
0
#ifdef ELEMENT
466
0
  ELEMENT const *xv = ctxt->xvec; /* Help the compiler.  */
467
0
  ELEMENT const *yv = ctxt->yvec;
468
0
  #define XREF_YREF_EQUAL(x,y)  EQUAL (xv[x], yv[y])
469
#else
470
  #define XREF_YREF_EQUAL(x,y)  XVECREF_YVECREF_EQUAL (ctxt, x, y)
471
#endif
472
473
0
  while (true)
474
0
    {
475
      /* Slide down the bottom initial diagonal.  */
476
0
      while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xoff, yoff))
477
0
        {
478
0
          xoff++;
479
0
          yoff++;
480
0
        }
481
482
      /* Slide up the top initial diagonal. */
483
0
      while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xlim - 1, ylim - 1))
484
0
        {
485
0
          xlim--;
486
0
          ylim--;
487
0
        }
488
489
      /* Handle simple cases. */
490
0
      if (xoff == xlim)
491
0
        {
492
0
          while (yoff < ylim)
493
0
            {
494
0
              NOTE_INSERT (ctxt, yoff);
495
0
              if (EARLY_ABORT (ctxt))
496
0
                return true;
497
0
              yoff++;
498
0
            }
499
0
          break;
500
0
        }
501
0
      if (yoff == ylim)
502
0
        {
503
0
          while (xoff < xlim)
504
0
            {
505
0
              NOTE_DELETE (ctxt, xoff);
506
0
              if (EARLY_ABORT (ctxt))
507
0
                return true;
508
0
              xoff++;
509
0
            }
510
0
          break;
511
0
        }
512
513
0
      struct partition part;
514
515
      /* Find a point of correspondence in the middle of the vectors.  */
516
0
      diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
517
518
      /* Use the partitions to split this problem into subproblems.  */
519
0
      OFFSET xoff1, xlim1, yoff1, ylim1, xoff2, xlim2, yoff2, ylim2;
520
0
      bool find_minimal1, find_minimal2;
521
0
      if (!NOTE_ORDERED
522
0
          && ((xlim + ylim) - (part.xmid + part.ymid)
523
0
              < (part.xmid + part.ymid) - (xoff + yoff)))
524
0
        {
525
          /* The second problem is smaller and the caller doesn't
526
             care about order, so do the second problem first to
527
             lessen recursion.  */
528
0
          xoff1 = part.xmid; xlim1 = xlim;
529
0
          yoff1 = part.ymid; ylim1 = ylim;
530
0
          find_minimal1 = part.hi_minimal;
531
532
0
          xoff2 = xoff; xlim2 = part.xmid;
533
0
          yoff2 = yoff; ylim2 = part.ymid;
534
0
          find_minimal2 = part.lo_minimal;
535
0
        }
536
0
      else
537
0
        {
538
0
          xoff1 = xoff; xlim1 = part.xmid;
539
0
          yoff1 = yoff; ylim1 = part.ymid;
540
0
          find_minimal1 = part.lo_minimal;
541
542
0
          xoff2 = part.xmid; xlim2 = xlim;
543
0
          yoff2 = part.ymid; ylim2 = ylim;
544
0
          find_minimal2 = part.hi_minimal;
545
0
        }
546
547
      /* Recurse to do one subproblem.  */
548
0
      bool early = compareseq (xoff1, xlim1, yoff1, ylim1, find_minimal1, ctxt);
549
0
      if (early)
550
0
        return early;
551
552
      /* Iterate to do the other subproblem.  */
553
0
      xoff = xoff2; xlim = xlim2;
554
0
      yoff = yoff2; ylim = ylim2;
555
0
      find_minimal = find_minimal2;
556
0
    }
557
558
0
  return false;
559
0
  #undef XREF_YREF_EQUAL
560
0
}
561
562
#if _GL_GNUC_PREREQ (4, 7)
563
# pragma GCC diagnostic pop
564
#endif
565
566
#undef ELEMENT
567
#undef EQUAL
568
#undef OFFSET
569
#undef EXTRA_CONTEXT_FIELDS
570
#undef NOTE_DELETE
571
#undef NOTE_INSERT
572
#undef EARLY_ABORT
573
#undef USE_HEURISTIC
574
#undef XVECREF_YVECREF_EQUAL
575
#undef OFFSET_MAX