Coverage Report

Created: 2026-01-25 07:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/gettext-0.26/gettext-tools/libgettextpo/diffseq.h
Line
Count
Source
1
/* Analyze differences between two vectors.
2
3
   Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2025 Free Software
4
   Foundation, Inc.
5
6
   This program is free software: you can redistribute it and/or modify
7
   it under the terms of the GNU General Public License as published by
8
   the Free Software Foundation, either version 3 of the License, or
9
   (at your option) any later version.
10
11
   This program is distributed in the hope that it will be useful,
12
   but WITHOUT ANY WARRANTY; without even the implied warranty of
13
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
   GNU General Public License for more details.
15
16
   You should have received a copy of the GNU General Public License
17
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
18
19
20
/* The basic idea is to consider two vectors as similar if, when
21
   transforming the first vector into the second vector through a
22
   sequence of edits (inserts and deletes of one element each),
23
   this sequence is short - or equivalently, if the ordered list
24
   of elements that are untouched by these edits is long.  For a
25
   good introduction to the subject, read about the "Levenshtein
26
   distance" in Wikipedia.
27
28
   The basic algorithm is described in:
29
   "An O(ND) Difference Algorithm and its Variations", Eugene W. Myers,
30
   Algorithmica Vol. 1, 1986, pp. 251-266,
31
   <https://doi.org/10.1007/BF01840446>.
32
   See especially section 4.2, which describes the variation used below.
33
34
   The basic algorithm was independently discovered as described in:
35
   "Algorithms for Approximate String Matching", Esko Ukkonen,
36
   Information and Control Vol. 64, 1985, pp. 100-118,
37
   <https://doi.org/10.1016/S0019-9958(85)80046-2>.
38
39
   Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
40
   heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
41
   at the price of producing suboptimal output for large inputs with
42
   many differences.  */
43
44
/* Before including this file, you need to define:
45
     ELEMENT                 The element type of the vectors being compared.
46
     EQUAL                   A two-argument macro that tests two elements for
47
                             equality.
48
     OFFSET                  A signed integer type sufficient to hold the
49
                             difference between two indices.  Usually
50
                             something like ptrdiff_t.
51
     OFFSET_MAX              (Optional) The maximum value of OFFSET (e.g.,
52
                             PTRDIFF_MAX).  If omitted, it is inferred in a
53
                             way portable to the vast majority of C platforms,
54
                             as they lack padding bits.
55
     EXTRA_CONTEXT_FIELDS    Declarations of fields for 'struct context'.
56
     NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
57
     NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
58
     NOTE_ORDERED            (Optional) A boolean expression saying that
59
                             NOTE_DELETE and NOTE_INSERT calls must be
60
                             issued in offset order.
61
     EARLY_ABORT(ctxt)       (Optional) A boolean expression that triggers an
62
                             early abort of the computation.
63
     USE_HEURISTIC           (Optional) Define if you want to support the
64
                             heuristic for large vectors.
65
66
   It is also possible to use this file with abstract arrays.  In this case,
67
   xvec and yvec are not represented in memory.  They only exist conceptually.
68
   In this case, the list of defines above is amended as follows:
69
     ELEMENT                 Undefined.
70
     EQUAL                   Undefined.
71
     XVECREF_YVECREF_EQUAL(ctxt, xoff, yoff)
72
                             A three-argument macro: References xvec[xoff] and
73
                             yvec[yoff] and tests these elements for equality.
74
75
   Before including this file, you also need to include:
76
     #include <limits.h>
77
     #include "minmax.h"
78
 */
79
80
/* This file uses _GL_GNUC_PREREQ.  */
81
#if !_GL_CONFIG_H_INCLUDED
82
 #error "Please include config.h first."
83
#endif
84
85
/* Maximum value of type OFFSET.  */
86
#ifndef OFFSET_MAX
87
# define OFFSET_MAX \
88
   ((((OFFSET) 1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
89
#endif
90
91
/* Default to no early abort.  */
92
#ifndef EARLY_ABORT
93
# define EARLY_ABORT(ctxt) false
94
#endif
95
96
#ifndef NOTE_ORDERED
97
# define NOTE_ORDERED false
98
#endif
99
100
/* Suppress gcc's "...may be used before initialized" warnings,
101
   generated by GCC versions up to at least GCC 15.1.
102
   Likewise for gcc -fanalyzer's "use of uninitialized value" warnings.  */
103
#if _GL_GNUC_PREREQ (4, 7)
104
# pragma GCC diagnostic push
105
# pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
106
# if _GL_GNUC_PREREQ (13, 0)
107
#  pragma GCC diagnostic ignored "-Wanalyzer-use-of-uninitialized-value"
108
# endif
109
#endif
110
111
/*
112
 * Context of comparison operation.
113
 */
114
struct context
115
{
116
  #ifdef ELEMENT
117
  /* Vectors being compared.  */
118
  ELEMENT const *xvec;
119
  ELEMENT const *yvec;
120
  #endif
121
122
  /* Extra fields.  */
123
  EXTRA_CONTEXT_FIELDS
124
125
  /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point
126
     furthest along the given diagonal in the forward search of the edit
127
     matrix.  */
128
  OFFSET *fdiag;
129
130
  /* Vector, indexed by diagonal, containing the X coordinate of the point
131
     furthest along the given diagonal in the backward search of the edit
132
     matrix.  */
133
  OFFSET *bdiag;
134
135
  #ifdef USE_HEURISTIC
136
  /* This corresponds to the diff --speed-large-files flag.  With this
137
     heuristic, for vectors with a constant small density of changes,
138
     the algorithm is linear in the vector size.  */
139
  bool heuristic;
140
  #endif
141
142
  /* Edit scripts longer than this are too expensive to compute.  */
143
  OFFSET too_expensive;
144
145
  /* Snakes bigger than this are considered "big".  */
146
0
  #define SNAKE_LIMIT 20
147
};
148
149
struct partition
150
{
151
  /* Midpoints of this partition.  */
152
  OFFSET xmid;
153
  OFFSET ymid;
154
155
  /* True if low half will be analyzed minimally.  */
156
  bool lo_minimal;
157
158
  /* Likewise for high half.  */
159
  bool hi_minimal;
160
};
161
162
163
/* Find the midpoint of the shortest edit script for a specified portion
164
   of the two vectors.
165
166
   Scan from the beginnings of the vectors, and simultaneously from the ends,
167
   doing a breadth-first search through the space of edit-sequence.
168
   When the two searches meet, we have found the midpoint of the shortest
169
   edit sequence.
170
171
   If FIND_MINIMAL is true, find the minimal edit script regardless of
172
   expense.  Otherwise, if the search is too expensive, use heuristics to
173
   stop the search and report a suboptimal answer.
174
175
   Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
176
   XMID - YMID equals the number of inserted elements minus the number
177
   of deleted elements (counting only elements before the midpoint).
178
179
   Set PART->lo_minimal to true iff the minimal edit script for the
180
   left half of the partition is known; similarly for PART->hi_minimal.
181
182
   This function assumes that the first elements of the specified portions
183
   of the two vectors do not match, and likewise that the last elements do not
184
   match.  The caller must trim matching elements from the beginning and end
185
   of the portions it is going to specify.
186
187
   If we return the "wrong" partitions, the worst this can do is cause
188
   suboptimal diff output.  It cannot cause incorrect diff output.  */
189
190
static void
191
diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
192
      struct partition *part, struct context *ctxt)
193
0
{
194
0
  OFFSET *const fd = ctxt->fdiag;       /* Give the compiler a chance. */
195
0
  OFFSET *const bd = ctxt->bdiag;       /* Additional help for the compiler. */
196
0
#ifdef ELEMENT
197
0
  ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */
198
0
  ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */
199
0
  #define XREF_YREF_EQUAL(x,y)  EQUAL (xv[x], yv[y])
200
#else
201
  #define XREF_YREF_EQUAL(x,y)  XVECREF_YVECREF_EQUAL (ctxt, x, y)
202
#endif
203
0
  const OFFSET dmin = xoff - ylim;      /* Minimum valid diagonal. */
204
0
  const OFFSET dmax = xlim - yoff;      /* Maximum valid diagonal. */
205
0
  const OFFSET fmid = xoff - yoff;      /* Center diagonal of top-down search. */
206
0
  const OFFSET bmid = xlim - ylim;      /* Center diagonal of bottom-up search. */
207
0
  OFFSET fmin = fmid;
208
0
  OFFSET fmax = fmid;           /* Limits of top-down search. */
209
0
  OFFSET bmin = bmid;
210
0
  OFFSET bmax = bmid;           /* Limits of bottom-up search. */
211
0
  OFFSET c;                     /* Cost. */
212
0
  bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
213
                                   diagonal with respect to the northwest. */
214
215
0
  fd[fmid] = xoff;
216
0
  bd[bmid] = xlim;
217
218
0
  for (c = 1;; ++c)
219
0
    {
220
0
      OFFSET d;                 /* Active diagonal. */
221
0
      bool big_snake = false;
222
223
      /* Extend the top-down search by an edit step in each diagonal. */
224
0
      if (fmin > dmin)
225
0
        fd[--fmin - 1] = -1;
226
0
      else
227
0
        ++fmin;
228
0
      if (fmax < dmax)
229
0
        fd[++fmax + 1] = -1;
230
0
      else
231
0
        --fmax;
232
0
      for (d = fmax; d >= fmin; d -= 2)
233
0
        {
234
0
          OFFSET x;
235
0
          OFFSET y;
236
0
          OFFSET tlo = fd[d - 1];
237
0
          OFFSET thi = fd[d + 1];
238
0
          OFFSET x0 = tlo < thi ? thi : tlo + 1;
239
240
0
          for (x = x0, y = x0 - d;
241
0
               x < xlim && y < ylim && XREF_YREF_EQUAL (x, y);
242
0
               x++, y++)
243
0
            continue;
244
0
          if (x - x0 > SNAKE_LIMIT)
245
0
            big_snake = true;
246
0
          fd[d] = x;
247
0
          if (odd && bmin <= d && d <= bmax && bd[d] <= x)
248
0
            {
249
0
              part->xmid = x;
250
0
              part->ymid = y;
251
0
              part->lo_minimal = part->hi_minimal = true;
252
0
              return;
253
0
            }
254
0
        }
255
256
      /* Similarly extend the bottom-up search.  */
257
0
      if (bmin > dmin)
258
0
        bd[--bmin - 1] = OFFSET_MAX;
259
0
      else
260
0
        ++bmin;
261
0
      if (bmax < dmax)
262
0
        bd[++bmax + 1] = OFFSET_MAX;
263
0
      else
264
0
        --bmax;
265
0
      for (d = bmax; d >= bmin; d -= 2)
266
0
        {
267
0
          OFFSET x;
268
0
          OFFSET y;
269
0
          OFFSET tlo = bd[d - 1];
270
0
          OFFSET thi = bd[d + 1];
271
0
          OFFSET x0 = tlo < thi ? tlo : thi - 1;
272
273
0
          for (x = x0, y = x0 - d;
274
0
               xoff < x && yoff < y && XREF_YREF_EQUAL (x - 1, y - 1);
275
0
               x--, y--)
276
0
            continue;
277
0
          if (x0 - x > SNAKE_LIMIT)
278
0
            big_snake = true;
279
0
          bd[d] = x;
280
0
          if (!odd && fmin <= d && d <= fmax && x <= fd[d])
281
0
            {
282
0
              part->xmid = x;
283
0
              part->ymid = y;
284
0
              part->lo_minimal = part->hi_minimal = true;
285
0
              return;
286
0
            }
287
0
        }
288
289
0
      if (find_minimal)
290
0
        continue;
291
292
#ifdef USE_HEURISTIC
293
      bool heuristic = ctxt->heuristic;
294
#else
295
0
      bool heuristic = false;
296
0
#endif
297
298
      /* Heuristic: check occasionally for a diagonal that has made lots
299
         of progress compared with the edit distance.  If we have any
300
         such, find the one that has made the most progress and return it
301
         as if it had succeeded.
302
303
         With this heuristic, for vectors with a constant small density
304
         of changes, the algorithm is linear in the vector size.  */
305
306
0
      if (200 < c && big_snake && heuristic)
307
0
        {
308
0
          {
309
0
            OFFSET best = 0;
310
311
0
            for (d = fmax; d >= fmin; d -= 2)
312
0
              {
313
0
                OFFSET dd = d - fmid;
314
0
                OFFSET x = fd[d];
315
0
                OFFSET y = x - d;
316
0
                OFFSET v = (x - xoff) * 2 - dd;
317
318
0
                if (v > 12 * (c + (dd < 0 ? -dd : dd)))
319
0
                  {
320
0
                    if (v > best
321
0
                        && xoff + SNAKE_LIMIT <= x && x < xlim
322
0
                        && yoff + SNAKE_LIMIT <= y && y < ylim)
323
0
                      {
324
                        /* We have a good enough best diagonal; now insist
325
                           that it end with a significant snake.  */
326
0
                        int k;
327
328
0
                        for (k = 1; XREF_YREF_EQUAL (x - k, y - k); k++)
329
0
                          if (k == SNAKE_LIMIT)
330
0
                            {
331
0
                              best = v;
332
0
                              part->xmid = x;
333
0
                              part->ymid = y;
334
0
                              break;
335
0
                            }
336
0
                      }
337
0
                  }
338
0
              }
339
0
            if (best > 0)
340
0
              {
341
0
                part->lo_minimal = true;
342
0
                part->hi_minimal = false;
343
0
                return;
344
0
              }
345
0
          }
346
347
0
          {
348
0
            OFFSET best = 0;
349
350
0
            for (d = bmax; d >= bmin; d -= 2)
351
0
              {
352
0
                OFFSET dd = d - bmid;
353
0
                OFFSET x = bd[d];
354
0
                OFFSET y = x - d;
355
0
                OFFSET v = (xlim - x) * 2 + dd;
356
357
0
                if (v > 12 * (c + (dd < 0 ? -dd : dd)))
358
0
                  {
359
0
                    if (v > best
360
0
                        && xoff < x && x <= xlim - SNAKE_LIMIT
361
0
                        && yoff < y && y <= ylim - SNAKE_LIMIT)
362
0
                      {
363
                        /* We have a good enough best diagonal; now insist
364
                           that it end with a significant snake.  */
365
0
                        int k;
366
367
0
                        for (k = 0; XREF_YREF_EQUAL (x + k, y + k); k++)
368
0
                          if (k == SNAKE_LIMIT - 1)
369
0
                            {
370
0
                              best = v;
371
0
                              part->xmid = x;
372
0
                              part->ymid = y;
373
0
                              break;
374
0
                            }
375
0
                      }
376
0
                  }
377
0
              }
378
0
            if (best > 0)
379
0
              {
380
0
                part->lo_minimal = false;
381
0
                part->hi_minimal = true;
382
0
                return;
383
0
              }
384
0
          }
385
0
        }
386
387
      /* Heuristic: if we've gone well beyond the call of duty, give up
388
         and report halfway between our best results so far.  */
389
0
      if (c >= ctxt->too_expensive)
390
0
        {
391
          /* Find forward diagonal that maximizes X + Y.  */
392
0
          OFFSET fxybest = -1, fxbest;
393
0
          for (d = fmax; d >= fmin; d -= 2)
394
0
            {
395
0
              OFFSET x = MIN (fd[d], xlim);
396
0
              OFFSET y = x - d;
397
0
              if (ylim < y)
398
0
                {
399
0
                  x = ylim + d;
400
0
                  y = ylim;
401
0
                }
402
0
              if (fxybest < x + y)
403
0
                {
404
0
                  fxybest = x + y;
405
0
                  fxbest = x;
406
0
                }
407
0
            }
408
409
          /* Find backward diagonal that minimizes X + Y.  */
410
0
          OFFSET bxybest = OFFSET_MAX, bxbest;
411
0
          for (d = bmax; d >= bmin; d -= 2)
412
0
            {
413
0
              OFFSET x = MAX (xoff, bd[d]);
414
0
              OFFSET y = x - d;
415
0
              if (y < yoff)
416
0
                {
417
0
                  x = yoff + d;
418
0
                  y = yoff;
419
0
                }
420
0
              if (x + y < bxybest)
421
0
                {
422
0
                  bxybest = x + y;
423
0
                  bxbest = x;
424
0
                }
425
0
            }
426
427
          /* Use the better of the two diagonals.  */
428
0
          if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
429
0
            {
430
0
              part->xmid = fxbest;
431
0
              part->ymid = fxybest - fxbest;
432
0
              part->lo_minimal = true;
433
0
              part->hi_minimal = false;
434
0
            }
435
0
          else
436
0
            {
437
0
              part->xmid = bxbest;
438
0
              part->ymid = bxybest - bxbest;
439
0
              part->lo_minimal = false;
440
0
              part->hi_minimal = true;
441
0
            }
442
0
          return;
443
0
        }
444
0
    }
445
0
  #undef XREF_YREF_EQUAL
446
0
}
447
448
449
/* Compare in detail contiguous subsequences of the two vectors
450
   which are known, as a whole, to match each other.
451
452
   The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
453
454
   Note that XLIM, YLIM are exclusive bounds.  All indices into the vectors
455
   are origin-0.
456
457
   If FIND_MINIMAL, find a minimal difference no matter how
458
   expensive it is.
459
460
   The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
461
462
   Return false if terminated normally, or true if terminated through early
463
   abort.  */
464
465
static bool
466
compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
467
            bool find_minimal, struct context *ctxt)
468
0
{
469
0
#ifdef ELEMENT
470
0
  ELEMENT const *xv = ctxt->xvec; /* Help the compiler.  */
471
0
  ELEMENT const *yv = ctxt->yvec;
472
0
  #define XREF_YREF_EQUAL(x,y)  EQUAL (xv[x], yv[y])
473
#else
474
  #define XREF_YREF_EQUAL(x,y)  XVECREF_YVECREF_EQUAL (ctxt, x, y)
475
#endif
476
477
0
  while (true)
478
0
    {
479
      /* Slide down the bottom initial diagonal.  */
480
0
      while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xoff, yoff))
481
0
        {
482
0
          xoff++;
483
0
          yoff++;
484
0
        }
485
486
      /* Slide up the top initial diagonal. */
487
0
      while (xoff < xlim && yoff < ylim && XREF_YREF_EQUAL (xlim - 1, ylim - 1))
488
0
        {
489
0
          xlim--;
490
0
          ylim--;
491
0
        }
492
493
      /* Handle simple cases. */
494
0
      if (xoff == xlim)
495
0
        {
496
0
          while (yoff < ylim)
497
0
            {
498
0
              NOTE_INSERT (ctxt, yoff);
499
0
              if (EARLY_ABORT (ctxt))
500
0
                return true;
501
0
              yoff++;
502
0
            }
503
0
          break;
504
0
        }
505
0
      if (yoff == ylim)
506
0
        {
507
0
          while (xoff < xlim)
508
0
            {
509
0
              NOTE_DELETE (ctxt, xoff);
510
0
              if (EARLY_ABORT (ctxt))
511
0
                return true;
512
0
              xoff++;
513
0
            }
514
0
          break;
515
0
        }
516
517
0
      struct partition part;
518
519
      /* Find a point of correspondence in the middle of the vectors.  */
520
0
      diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
521
522
      /* Use the partitions to split this problem into subproblems.  */
523
0
      OFFSET xoff1, xlim1, yoff1, ylim1, xoff2, xlim2, yoff2, ylim2;
524
0
      bool find_minimal1, find_minimal2;
525
0
      if (!NOTE_ORDERED
526
0
          && ((xlim + ylim) - (part.xmid + part.ymid)
527
0
              < (part.xmid + part.ymid) - (xoff + yoff)))
528
0
        {
529
          /* The second problem is smaller and the caller doesn't
530
             care about order, so do the second problem first to
531
             lessen recursion.  */
532
0
          xoff1 = part.xmid; xlim1 = xlim;
533
0
          yoff1 = part.ymid; ylim1 = ylim;
534
0
          find_minimal1 = part.hi_minimal;
535
536
0
          xoff2 = xoff; xlim2 = part.xmid;
537
0
          yoff2 = yoff; ylim2 = part.ymid;
538
0
          find_minimal2 = part.lo_minimal;
539
0
        }
540
0
      else
541
0
        {
542
0
          xoff1 = xoff; xlim1 = part.xmid;
543
0
          yoff1 = yoff; ylim1 = part.ymid;
544
0
          find_minimal1 = part.lo_minimal;
545
546
0
          xoff2 = part.xmid; xlim2 = xlim;
547
0
          yoff2 = part.ymid; ylim2 = ylim;
548
0
          find_minimal2 = part.hi_minimal;
549
0
        }
550
551
      /* Recurse to do one subproblem.  */
552
0
      bool early = compareseq (xoff1, xlim1, yoff1, ylim1, find_minimal1, ctxt);
553
0
      if (early)
554
0
        return early;
555
556
      /* Iterate to do the other subproblem.  */
557
0
      xoff = xoff2; xlim = xlim2;
558
0
      yoff = yoff2; ylim = ylim2;
559
0
      find_minimal = find_minimal2;
560
0
    }
561
562
0
  return false;
563
0
  #undef XREF_YREF_EQUAL
564
0
}
565
566
#if _GL_GNUC_PREREQ (4, 7)
567
# pragma GCC diagnostic pop
568
#endif
569
570
#undef ELEMENT
571
#undef EQUAL
572
#undef OFFSET
573
#undef EXTRA_CONTEXT_FIELDS
574
#undef NOTE_DELETE
575
#undef NOTE_INSERT
576
#undef EARLY_ABORT
577
#undef USE_HEURISTIC
578
#undef XVECREF_YVECREF_EQUAL
579
#undef OFFSET_MAX