Coverage Report

Created: 2025-12-31 07:01

/src/git/xdiff/xdiffi.c
Every executable line in this file has an execution count of 0 (uncovered).

/*
 *  LibXDiff by Davide Libenzi ( File Differential Library )
 *  Copyright (C) 2003  Davide Libenzi
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, see
 *  <http://www.gnu.org/licenses/>.
 *
 *  Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include "xinclude.h"

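/*
 * Return the hash used to compare two records; index is a position in the
 * effective record list, and reference_index maps it back to the record's
 * slot in recs[].
 */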
static size_t get_hash(xdfile_t *xdf, long index)
{
  return xdf->recs[xdf->reference_index[index]].minimal_perfect_hash;
}

#define XDL_MAX_COST_MIN 256
#define XDL_HEUR_MIN_COST 256
#define XDL_LINE_MAX (long)((1UL << (CHAR_BIT * sizeof(long) - 1)) - 1)
#define XDL_SNAKE_CNT 20
#define XDL_K_HEUR 4

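/*
 * Result of splitting a box: (i1, i2) is the chosen split point, and
 * min_lo/min_hi are forwarded as the need_min argument when xdl_recs_cmp()
 * recurses into the lower and upper sub-box.
 */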
typedef struct s_xdpsplit {
  long i1, i2;
  int min_lo, min_hi;
} xdpsplit_t;

/*
 * See "An O(ND) Difference Algorithm and its Variations", by Eugene Myers.
 * Basically, it considers a "box" (off1, off2, lim1, lim2) and scans from both
 * the forward diagonal starting at (off1, off2) and the backward diagonal
 * starting at (lim1, lim2). If the K values on the same diagonal cross, it
 * returns the furthest point of reach. We might encounter expensive edge cases
 * using this algorithm, so a little bit of heuristic is needed to cut the
 * search and to return a suboptimal point.
 */
static long xdl_split(xdfile_t *xdf1, long off1, long lim1,
          xdfile_t *xdf2, long off2, long lim2,
          long *kvdf, long *kvdb, int need_min, xdpsplit_t *spl,
          xdalgoenv_t *xenv) {
  long dmin = off1 - lim2, dmax = lim1 - off2;
  long fmid = off1 - off2, bmid = lim1 - lim2;
  long odd = (fmid - bmid) & 1;
  long fmin = fmid, fmax = fmid;
  long bmin = bmid, bmax = bmid;
  long ec, d, i1, i2, prev1, best, dd, v, k;

  /*
   * Set initial diagonal values for both forward and backward path.
   */
  kvdf[fmid] = off1;
  kvdb[bmid] = lim1;

  for (ec = 1;; ec++) {
    int got_snake = 0;

    /*
     * We need to extend the diagonal "domain" by one. If the next
     * value exits the box boundaries we need to change it in the
     * opposite direction because (max - min) must be a power of
     * two.
     *
     * Also, we initialize the external K value to -1 so that we can
     * avoid extra conditions in the check inside the core loop.
     */
    if (fmin > dmin)
      kvdf[--fmin - 1] = -1;
    else
      ++fmin;
    if (fmax < dmax)
      kvdf[++fmax + 1] = -1;
    else
      --fmax;

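    /*
     * Forward pass: for each forward diagonal, start from the better of
     * the two neighbouring diagonals, follow the snake of matching
     * records, and check whether the forward path has met the backward
     * one.
     */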
    for (d = fmax; d >= fmin; d -= 2) {
      if (kvdf[d - 1] >= kvdf[d + 1])
        i1 = kvdf[d - 1] + 1;
      else
        i1 = kvdf[d + 1];
      prev1 = i1;
      i2 = i1 - d;
      for (; i1 < lim1 && i2 < lim2 && get_hash(xdf1, i1) == get_hash(xdf2, i2); i1++, i2++);
      if (i1 - prev1 > xenv->snake_cnt)
        got_snake = 1;
      kvdf[d] = i1;
      if (odd && bmin <= d && d <= bmax && kvdb[d] <= i1) {
        spl->i1 = i1;
        spl->i2 = i2;
        spl->min_lo = spl->min_hi = 1;
        return ec;
      }
    }

    /*
     * We need to extend the diagonal "domain" by one. If the next
     * value exits the box boundaries we need to change it in the
     * opposite direction because (max - min) must be a power of
     * two.
     *
     * Also, we initialize the external K value to -1 so that we can
     * avoid extra conditions in the check inside the core loop.
     */
    if (bmin > dmin)
      kvdb[--bmin - 1] = XDL_LINE_MAX;
    else
      ++bmin;
    if (bmax < dmax)
      kvdb[++bmax + 1] = XDL_LINE_MAX;
    else
      --bmax;

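    /*
     * Backward pass: the mirror image of the forward pass above, following
     * the snakes toward (off1, off2) and checking for a crossing with the
     * forward path.
     */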
    for (d = bmax; d >= bmin; d -= 2) {
      if (kvdb[d - 1] < kvdb[d + 1])
        i1 = kvdb[d - 1];
      else
        i1 = kvdb[d + 1] - 1;
      prev1 = i1;
      i2 = i1 - d;
      for (; i1 > off1 && i2 > off2 && get_hash(xdf1, i1 - 1) == get_hash(xdf2, i2 - 1); i1--, i2--);
      if (prev1 - i1 > xenv->snake_cnt)
        got_snake = 1;
      kvdb[d] = i1;
      if (!odd && fmin <= d && d <= fmax && i1 <= kvdf[d]) {
        spl->i1 = i1;
        spl->i2 = i2;
        spl->min_lo = spl->min_hi = 1;
        return ec;
      }
    }

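    /*
     * If a minimal diff is required, skip all of the heuristic cut-offs
     * below (including the mxcost bail-out) and keep searching until the
     * forward and backward paths meet.
     */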
    if (need_min)
      continue;

    /*
     * If the edit cost is above the heuristic trigger and we
     * got a good snake, we sample the current diagonals to see
     * if some of them have reached an "interesting" path. Our
     * measure is a function of the distance from the diagonal
     * corner (i1 + i2), penalized by the distance from the
     * mid diagonal itself. If this value is above the current
     * edit cost times a magic factor (XDL_K_HEUR), we consider
     * it interesting.
     */
    if (got_snake && ec > xenv->heur_min) {
      for (best = 0, d = fmax; d >= fmin; d -= 2) {
        dd = d > fmid ? d - fmid: fmid - d;
        i1 = kvdf[d];
        i2 = i1 - d;
        v = (i1 - off1) + (i2 - off2) - dd;

        if (v > XDL_K_HEUR * ec && v > best &&
            off1 + xenv->snake_cnt <= i1 && i1 < lim1 &&
            off2 + xenv->snake_cnt <= i2 && i2 < lim2) {
          for (k = 1; get_hash(xdf1, i1 - k) == get_hash(xdf2, i2 - k); k++)
            if (k == xenv->snake_cnt) {
              best = v;
              spl->i1 = i1;
              spl->i2 = i2;
              break;
            }
        }
      }
      if (best > 0) {
        spl->min_lo = 1;
        spl->min_hi = 0;
        return ec;
      }

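      /* Sample the backward diagonals in the same way. */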
      for (best = 0, d = bmax; d >= bmin; d -= 2) {
        dd = d > bmid ? d - bmid: bmid - d;
        i1 = kvdb[d];
        i2 = i1 - d;
        v = (lim1 - i1) + (lim2 - i2) - dd;

        if (v > XDL_K_HEUR * ec && v > best &&
            off1 < i1 && i1 <= lim1 - xenv->snake_cnt &&
            off2 < i2 && i2 <= lim2 - xenv->snake_cnt) {
          for (k = 0; get_hash(xdf1, i1 + k) == get_hash(xdf2, i2 + k); k++)
            if (k == xenv->snake_cnt - 1) {
              best = v;
              spl->i1 = i1;
              spl->i2 = i2;
              break;
            }
        }
      }
      if (best > 0) {
        spl->min_lo = 0;
        spl->min_hi = 1;
        return ec;
      }
    }

    /*
     * Enough is enough. We spent too much time here and now we
     * collect the furthest reaching path using the (i1 + i2)
     * measure.
     */
    if (ec >= xenv->mxcost) {
      long fbest, fbest1, bbest, bbest1;

      fbest = fbest1 = -1;
      for (d = fmax; d >= fmin; d -= 2) {
        i1 = XDL_MIN(kvdf[d], lim1);
        i2 = i1 - d;
        if (lim2 < i2) {
          i1 = lim2 + d;
          i2 = lim2;
        }
        if (fbest < i1 + i2) {
          fbest = i1 + i2;
          fbest1 = i1;
        }
      }

      bbest = bbest1 = XDL_LINE_MAX;
      for (d = bmax; d >= bmin; d -= 2) {
        i1 = XDL_MAX(off1, kvdb[d]);
        i2 = i1 - d;
        if (i2 < off2) {
          i1 = off2 + d;
          i2 = off2;
        }
        if (i1 + i2 < bbest) {
          bbest = i1 + i2;
          bbest1 = i1;
        }
      }

      if ((lim1 + lim2) - bbest < fbest - (off1 + off2)) {
        spl->i1 = fbest1;
        spl->i2 = fbest - fbest1;
        spl->min_lo = 1;
        spl->min_hi = 0;
      } else {
        spl->i1 = bbest1;
        spl->i2 = bbest - bbest1;
        spl->min_lo = 0;
        spl->min_hi = 1;
      }
      return ec;
    }
  }
}

/*
 * Rule: "Divide et Impera" (divide & conquer). Recursively split the box into
 * sub-boxes by calling the box splitting function. Note that the real job
 * (marking changed lines) is done in the two boundary reaching checks.
 */
int xdl_recs_cmp(xdfile_t *xdf1, long off1, long lim1,
     xdfile_t *xdf2, long off2, long lim2,
     long *kvdf, long *kvdb, int need_min, xdalgoenv_t *xenv) {

  /*
   * Shrink the box by walking through each diagonal snake (SW and NE).
   */
  for (; off1 < lim1 && off2 < lim2 && get_hash(xdf1, off1) == get_hash(xdf2, off2); off1++, off2++);
  for (; off1 < lim1 && off2 < lim2 && get_hash(xdf1, lim1 - 1) == get_hash(xdf2, lim2 - 1); lim1--, lim2--);

  /*
   * If one dimension is empty, then all records on the other one must
   * be obviously changed.
   */
  if (off1 == lim1) {
    for (; off2 < lim2; off2++)
      xdf2->changed[xdf2->reference_index[off2]] = true;
  } else if (off2 == lim2) {
    for (; off1 < lim1; off1++)
      xdf1->changed[xdf1->reference_index[off1]] = true;
  } else {
    xdpsplit_t spl;
    spl.i1 = spl.i2 = 0;

    /*
     * Divide ...
     */
    if (xdl_split(xdf1, off1, lim1, xdf2, off2, lim2, kvdf, kvdb,
            need_min, &spl, xenv) < 0) {

      return -1;
    }

    /*
     * ... et Impera.
     */
    if (xdl_recs_cmp(xdf1, off1, spl.i1, xdf2, off2, spl.i2,
         kvdf, kvdb, spl.min_lo, xenv) < 0 ||
        xdl_recs_cmp(xdf1, spl.i1, lim1, xdf2, spl.i2, lim2,
         kvdf, kvdb, spl.min_hi, xenv) < 0) {

      return -1;
    }
  }

  return 0;
}

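/*
 * Top-level diff driver: prepare the two files, dispatch to the patience or
 * histogram algorithm when one of those was requested, and otherwise allocate
 * the forward/backward K vectors and run the classical algorithm through
 * xdl_recs_cmp().
 */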
int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
    xdfenv_t *xe) {
  long ndiags;
  long *kvd, *kvdf, *kvdb;
  xdalgoenv_t xenv;
  int res;

  if (xdl_prepare_env(mf1, mf2, xpp, xe) < 0)
    return -1;

  if (XDF_DIFF_ALG(xpp->flags) == XDF_PATIENCE_DIFF) {
    res = xdl_do_patience_diff(xpp, xe);
    goto out;
  }

  if (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF) {
    res = xdl_do_histogram_diff(xpp, xe);
    goto out;
  }

  /*
   * Allocate and setup K vectors to be used by the differential
   * algorithm.
   *
   * One is to store the forward path and one to store the backward path.
   */
  ndiags = xe->xdf1.nreff + xe->xdf2.nreff + 3;
  if (!XDL_ALLOC_ARRAY(kvd, 2 * ndiags + 2)) {

    xdl_free_env(xe);
    return -1;
  }
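  /*
   * Bias the two K vectors so that they can be indexed directly by the
   * diagonal number, which can be negative.
   */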
  kvdf = kvd;
  kvdb = kvdf + ndiags;
  kvdf += xe->xdf2.nreff + 1;
  kvdb += xe->xdf2.nreff + 1;

  xenv.mxcost = xdl_bogosqrt(ndiags);
  if (xenv.mxcost < XDL_MAX_COST_MIN)
    xenv.mxcost = XDL_MAX_COST_MIN;
  xenv.snake_cnt = XDL_SNAKE_CNT;
  xenv.heur_min = XDL_HEUR_MIN_COST;

  res = xdl_recs_cmp(&xe->xdf1, 0, xe->xdf1.nreff, &xe->xdf2, 0, xe->xdf2.nreff,
         kvdf, kvdb, (xpp->flags & XDF_NEED_MINIMAL) != 0,
         &xenv);
  xdl_free(kvd);
 out:
  if (res < 0)
    xdl_free_env(xe);

  return res;
}

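/*
 * Allocate a new change record covering chg1 lines starting at i1 in the
 * first file and chg2 lines starting at i2 in the second one, and prepend it
 * to the script list xscr.
 */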
static xdchange_t *xdl_add_change(xdchange_t *xscr, long i1, long i2, long chg1, long chg2) {
  xdchange_t *xch;

  if (!(xch = (xdchange_t *) xdl_malloc(sizeof(xdchange_t))))
    return NULL;

  xch->next = xscr;
  xch->i1 = i1;
  xch->i2 = i2;
  xch->chg1 = chg1;
  xch->chg2 = chg2;
  xch->ignore = 0;

  return xch;
}


static int recs_match(xrecord_t *rec1, xrecord_t *rec2)
{
  return rec1->minimal_perfect_hash == rec2->minimal_perfect_hash;
}

/*
 * If a line is indented more than this, get_indent() just returns this value.
 * This avoids having to do absurd amounts of work for data that are not
 * human-readable text, and also ensures that the output of get_indent fits
 * within an int.
 */
#define MAX_INDENT 200

/*
 * Return the amount of indentation of the specified line, treating TAB as 8
 * columns. Return -1 if line is empty or contains only whitespace. Clamp the
 * output value at MAX_INDENT.
 */
static int get_indent(xrecord_t *rec)
{
  int ret = 0;

  for (size_t i = 0; i < rec->size; i++) {
    char c = (char) rec->ptr[i];

    if (!XDL_ISSPACE(c))
      return ret;
    else if (c == ' ')
      ret += 1;
    else if (c == '\t')
      ret += 8 - ret % 8;
    /* ignore other whitespace characters */

    if (ret >= MAX_INDENT)
      return MAX_INDENT;
  }

  /* The line contains only whitespace. */
  return -1;
}

/*
 * If more than this number of consecutive blank rows are found, just return
 * this value. This avoids requiring O(N^2) work for pathological cases, and
 * also ensures that the output of score_split fits in an int.
 */
#define MAX_BLANKS 20

/* Characteristics measured about a hypothetical split position. */
struct split_measurement {
  /*
   * Is the split at the end of the file (aside from any blank lines)?
   */
  int end_of_file;

  /*
   * How much is the line immediately following the split indented (or -1
   * if the line is blank):
   */
  int indent;

  /*
   * How many consecutive lines above the split are blank?
   */
  int pre_blank;

  /*
   * How much is the nearest non-blank line above the split indented (or
   * -1 if there is no such line)?
   */
  int pre_indent;

  /*
   * How many lines after the line following the split are blank?
   */
  int post_blank;

  /*
   * How much is the nearest non-blank line after the line following the
   * split indented (or -1 if there is no such line)?
   */
  int post_indent;
};

struct split_score {
  /* The effective indent of this split (smaller is preferred). */
  int effective_indent;

  /* Penalty for this split (smaller is preferred). */
  int penalty;
};

/*
 * Fill m with information about a hypothetical split of xdf above line split.
 */
static void measure_split(const xdfile_t *xdf, long split,
        struct split_measurement *m)
{
  long i;

  if (split >= (long)xdf->nrec) {
    m->end_of_file = 1;
    m->indent = -1;
  } else {
    m->end_of_file = 0;
    m->indent = get_indent(&xdf->recs[split]);
  }

  m->pre_blank = 0;
  m->pre_indent = -1;
  for (i = split - 1; i >= 0; i--) {
    m->pre_indent = get_indent(&xdf->recs[i]);
    if (m->pre_indent != -1)
      break;
    m->pre_blank += 1;
    if (m->pre_blank == MAX_BLANKS) {
      m->pre_indent = 0;
      break;
    }
  }

  m->post_blank = 0;
  m->post_indent = -1;
  for (i = split + 1; i < (long)xdf->nrec; i++) {
    m->post_indent = get_indent(&xdf->recs[i]);
    if (m->post_indent != -1)
      break;
    m->post_blank += 1;
    if (m->post_blank == MAX_BLANKS) {
      m->post_indent = 0;
      break;
    }
  }
}

/*
 * The empirically-determined weight factors used by score_split() below.
 * Larger values mean that the position is a less favorable place to split.
 *
 * Note that scores are only ever compared against each other, so multiplying
 * all of these weight/penalty values by the same factor wouldn't change the
 * heuristic's behavior. Still, we need to set that arbitrary scale *somehow*.
 * In practice, these numbers are chosen to be large enough that they can be
 * adjusted relative to each other with sufficient precision despite using
 * integer math.
 */

/* Penalty if there are no non-blank lines before the split */
#define START_OF_FILE_PENALTY 1

/* Penalty if there are no non-blank lines after the split */
#define END_OF_FILE_PENALTY 21

/* Multiplier for the number of blank lines around the split */
#define TOTAL_BLANK_WEIGHT (-30)

/* Multiplier for the number of blank lines after the split */
#define POST_BLANK_WEIGHT 6

/*
 * Penalties applied if the line is indented more than its predecessor
 */
#define RELATIVE_INDENT_PENALTY (-4)
#define RELATIVE_INDENT_WITH_BLANK_PENALTY 10

/*
 * Penalties applied if the line is indented less than both its predecessor and
 * its successor
 */
#define RELATIVE_OUTDENT_PENALTY 24
#define RELATIVE_OUTDENT_WITH_BLANK_PENALTY 17

/*
 * Penalties applied if the line is indented less than its predecessor but not
 * less than its successor
 */
#define RELATIVE_DEDENT_PENALTY 23
#define RELATIVE_DEDENT_WITH_BLANK_PENALTY 17

/*
 * We only consider whether the sum of the effective indents for one candidate
 * split is less than (-1), equal to (0), or greater than (+1) that of the
 * other. The resulting value is multiplied by the following weight and
 * combined with the penalty to determine the better of two scores.
 */
#define INDENT_WEIGHT 60

/*
 * How far do we slide a hunk at most?
 */
#define INDENT_HEURISTIC_MAX_SLIDING 100

/*
 * Compute a badness score for the hypothetical split whose measurements are
 * stored in m. The weight factors were determined empirically using the tools
 * and corpus described in
 *
 *     https://github.com/mhagger/diff-slider-tools
 *
 * Also see that project if you want to improve the weights based on, for
 * example, a larger or more diverse corpus.
 */
static void score_add_split(const struct split_measurement *m, struct split_score *s)
{
  /*
   * A place to accumulate penalty factors (positive makes this index more
   * favored):
   */
  int post_blank, total_blank, indent, any_blanks;

  if (m->pre_indent == -1 && m->pre_blank == 0)
    s->penalty += START_OF_FILE_PENALTY;

  if (m->end_of_file)
    s->penalty += END_OF_FILE_PENALTY;

  /*
   * Set post_blank to the number of blank lines following the split,
   * including the line immediately after the split:
   */
  post_blank = (m->indent == -1) ? 1 + m->post_blank : 0;
  total_blank = m->pre_blank + post_blank;

  /* Penalties based on nearby blank lines: */
  s->penalty += TOTAL_BLANK_WEIGHT * total_blank;
  s->penalty += POST_BLANK_WEIGHT * post_blank;

  if (m->indent != -1)
    indent = m->indent;
  else
    indent = m->post_indent;

  any_blanks = (total_blank != 0);

  /* Note that the effective indent is -1 at the end of the file: */
  s->effective_indent += indent;

  if (indent == -1) {
    /* No additional adjustments needed. */
  } else if (m->pre_indent == -1) {
    /* No additional adjustments needed. */
  } else if (indent > m->pre_indent) {
    /*
     * The line is indented more than its predecessor.
     */
    s->penalty += any_blanks ?
      RELATIVE_INDENT_WITH_BLANK_PENALTY :
      RELATIVE_INDENT_PENALTY;
  } else if (indent == m->pre_indent) {
    /*
     * The line has the same indentation level as its predecessor.
     * No additional adjustments needed.
     */
  } else {
    /*
     * The line is indented less than its predecessor. It could be
     * the block terminator of the previous block, but it could
     * also be the start of a new block (e.g., an "else" block, or
     * maybe the previous block didn't have a block terminator).
     * Try to distinguish those cases based on what comes next:
     */
    if (m->post_indent != -1 && m->post_indent > indent) {
      /*
       * The following line is indented more. So it is likely
       * that this line is the start of a block.
       */
      s->penalty += any_blanks ?
        RELATIVE_OUTDENT_WITH_BLANK_PENALTY :
        RELATIVE_OUTDENT_PENALTY;
    } else {
      /*
       * That was probably the end of a block.
       */
      s->penalty += any_blanks ?
        RELATIVE_DEDENT_WITH_BLANK_PENALTY :
        RELATIVE_DEDENT_PENALTY;
    }
  }
}

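/*
 * Compare two split scores; the result is negative if s1 is the better
 * (lower-scoring) split, positive if s2 is better, and zero for a tie.
 */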
static int score_cmp(struct split_score *s1, struct split_score *s2)
{
  /* -1 if s1.effective_indent < s2->effective_indent, etc. */
  int cmp_indents = ((s1->effective_indent > s2->effective_indent) -
         (s1->effective_indent < s2->effective_indent));

  return INDENT_WEIGHT * cmp_indents + (s1->penalty - s2->penalty);
}

/*
 * Represent a group of changed lines in an xdfile_t (i.e., a contiguous group
 * of lines that was inserted or deleted from the corresponding version of the
 * file). We consider there to be such a group at the beginning of the file, at
 * the end of the file, and between any two unchanged lines, though most such
 * groups will usually be empty.
 *
 * If the first line in a group is equal to the line following the group, then
 * the group can be slid down. Similarly, if the last line in a group is equal
 * to the line preceding the group, then the group can be slid up. See
 * group_slide_down() and group_slide_up().
 *
 * Note that loops that are testing for changed lines in xdf->changed do not
 * need index bounding since the array is prepared with a zero at position -1
 * and N.
 */
struct xdlgroup {
  /*
   * The index of the first changed line in the group, or the index of
   * the unchanged line above which the (empty) group is located.
   */
  long start;

  /*
   * The index of the first unchanged line after the group. For an empty
   * group, end is equal to start.
   */
  long end;
};

/*
 * Initialize g to point at the first group in xdf.
 */
static void group_init(xdfile_t *xdf, struct xdlgroup *g)
{
  g->start = g->end = 0;
  while (xdf->changed[g->end])
    g->end++;
}

/*
 * Move g to describe the next (possibly empty) group in xdf and return 0. If g
 * is already at the end of the file, do nothing and return -1.
 */
static inline int group_next(xdfile_t *xdf, struct xdlgroup *g)
{
  if (g->end == (long)xdf->nrec)
    return -1;

  g->start = g->end + 1;
  for (g->end = g->start; xdf->changed[g->end]; g->end++)
    ;

  return 0;
}

/*
 * Move g to describe the previous (possibly empty) group in xdf and return 0.
 * If g is already at the beginning of the file, do nothing and return -1.
 */
static inline int group_previous(xdfile_t *xdf, struct xdlgroup *g)
{
  if (g->start == 0)
    return -1;

  g->end = g->start - 1;
  for (g->start = g->end; xdf->changed[g->start - 1]; g->start--)
    ;

  return 0;
}

/*
 * If g can be slid toward the end of the file, do so, and if it bumps into a
 * following group, expand this group to include it. Return 0 on success or -1
 * if g cannot be slid down.
 */
static int group_slide_down(xdfile_t *xdf, struct xdlgroup *g)
{
  if (g->end < (long)xdf->nrec &&
      recs_match(&xdf->recs[g->start], &xdf->recs[g->end])) {
    xdf->changed[g->start++] = false;
    xdf->changed[g->end++] = true;

    while (xdf->changed[g->end])
      g->end++;

    return 0;
  } else {
    return -1;
  }
}

/*
 * If g can be slid toward the beginning of the file, do so, and if it bumps
 * into a previous group, expand this group to include it. Return 0 on success
 * or -1 if g cannot be slid up.
 */
static int group_slide_up(xdfile_t *xdf, struct xdlgroup *g)
{
  if (g->start > 0 &&
      recs_match(&xdf->recs[g->start - 1], &xdf->recs[g->end - 1])) {
    xdf->changed[--g->start] = true;
    xdf->changed[--g->end] = false;

    while (xdf->changed[g->start - 1])
      g->start--;

    return 0;
  } else {
    return -1;
  }
}

/*
 * Move back and forward change groups for a consistent and pretty diff output.
 * This also helps in finding joinable change groups and reducing the diff
 * size.
 */
int xdl_change_compact(xdfile_t *xdf, xdfile_t *xdfo, long flags) {
  struct xdlgroup g, go;
  long earliest_end, end_matching_other;
  long groupsize;

  group_init(xdf, &g);
  group_init(xdfo, &go);

  while (1) {
    /*
     * If the group is empty in the to-be-compacted file, skip it:
     */
    if (g.end == g.start)
      goto next;

    /*
     * Now shift the change up and then down as far as possible in
     * each direction. If it bumps into any other changes, merge
     * them.
     */
    do {
      groupsize = g.end - g.start;

      /*
       * Keep track of the last "end" index that causes this
       * group to align with a group of changed lines in the
       * other file. -1 indicates that we haven't found such
       * a match yet:
       */
      end_matching_other = -1;

      /* Shift the group backward as much as possible: */
      while (!group_slide_up(xdf, &g))
        if (group_previous(xdfo, &go))
          BUG("group sync broken sliding up");

      /*
       * This is the highest that this group can be shifted.
       * Record its end index:
       */
      earliest_end = g.end;

      if (go.end > go.start)
        end_matching_other = g.end;

      /* Now shift the group forward as far as possible: */
      while (1) {
        if (group_slide_down(xdf, &g))
          break;
        if (group_next(xdfo, &go))
          BUG("group sync broken sliding down");

        if (go.end > go.start)
          end_matching_other = g.end;
      }
    } while (groupsize != g.end - g.start);

    /*
     * If the group can be shifted, then we can possibly use this
     * freedom to produce a more intuitive diff.
     *
     * The group is currently shifted as far down as possible, so
     * the heuristics below only have to handle upwards shifts.
     */

    if (g.end == earliest_end) {
      /* no shifting was possible */
    } else if (end_matching_other != -1) {
      /*
       * Move the possibly merged group of changes back to
       * line up with the last group of changes from the
       * other file that it can align with.
       */
      while (go.end == go.start) {
        if (group_slide_up(xdf, &g))
          BUG("match disappeared");
        if (group_previous(xdfo, &go))
          BUG("group sync broken sliding to match");
      }
    } else if (flags & XDF_INDENT_HEURISTIC) {
      /*
       * Indent heuristic: a group of pure add/delete lines
       * implies two splits, one between the end of the
       * "before" context and the start of the group, and
       * another between the end of the group and the
       * beginning of the "after" context. Some splits are
       * aesthetically better and some are worse. We compute
       * a badness "score" for each split, and add the scores
       * for the two splits to define a "score" for each
       * position that the group can be shifted to. Then we
       * pick the shift with the lowest score.
       */
      long shift, best_shift = -1;
      struct split_score best_score;

      shift = earliest_end;
      if (g.end - groupsize - 1 > shift)
        shift = g.end - groupsize - 1;
      if (g.end - INDENT_HEURISTIC_MAX_SLIDING > shift)
        shift = g.end - INDENT_HEURISTIC_MAX_SLIDING;
      for (; shift <= g.end; shift++) {
        struct split_measurement m;
        struct split_score score = {0, 0};

        measure_split(xdf, shift, &m);
        score_add_split(&m, &score);
        measure_split(xdf, shift - groupsize, &m);
        score_add_split(&m, &score);
        if (best_shift == -1 ||
            score_cmp(&score, &best_score) <= 0) {
          best_score.effective_indent = score.effective_indent;
          best_score.penalty = score.penalty;
          best_shift = shift;
        }
      }

      while (g.end > best_shift) {
        if (group_slide_up(xdf, &g))
          BUG("best shift unreached");
        if (group_previous(xdfo, &go))
          BUG("group sync broken sliding to blank line");
      }
    }

  next:
    /* Move past the just-processed group: */
    if (group_next(xdf, &g))
      break;
    if (group_next(xdfo, &go))
      BUG("group sync broken moving to next group");
  }

  if (!group_next(xdfo, &go))
    BUG("group sync broken at end of file");

  return 0;
}

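/*
 * Walk both change maps from the end of the files toward the beginning and
 * emit one xdchange_t per run of changed lines; the resulting script is
 * returned through *xscr, ordered from the start of the file to the end.
 */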
int xdl_build_script(xdfenv_t *xe, xdchange_t **xscr) {
  xdchange_t *cscr = NULL, *xch;
  bool *changed1 = xe->xdf1.changed, *changed2 = xe->xdf2.changed;
  long i1, i2, l1, l2;

  /*
   * Trivial. Collects "groups" of changes and creates an edit script.
   */
  for (i1 = xe->xdf1.nrec, i2 = xe->xdf2.nrec; i1 >= 0 || i2 >= 0; i1--, i2--)
    if (changed1[i1 - 1] || changed2[i2 - 1]) {
      for (l1 = i1; changed1[i1 - 1]; i1--);
      for (l2 = i2; changed2[i2 - 1]; i2--);

      if (!(xch = xdl_add_change(cscr, i1, i2, l1 - i1, l2 - i2))) {
        xdl_free_script(cscr);
        return -1;
      }
      cscr = xch;
    }

  *xscr = cscr;

  return 0;
}


void xdl_free_script(xdchange_t *xscr) {
  xdchange_t *xch;

  while ((xch = xscr) != NULL) {
    xscr = xscr->next;
    xdl_free(xch);
  }
}

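/*
 * Instead of emitting a textual diff, hand each hunk's line ranges to the
 * caller-provided hunk_func callback.
 */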
static int xdl_call_hunk_func(xdfenv_t *xe UNUSED, xdchange_t *xscr, xdemitcb_t *ecb,
            xdemitconf_t const *xecfg)
{
  xdchange_t *xch, *xche;

  for (xch = xscr; xch; xch = xche->next) {
    xche = xdl_get_hunk(&xch, xecfg);
    if (!xch)
      break;
    if (xecfg->hunk_func(xch->i1, xche->i1 + xche->chg1 - xch->i1,
             xch->i2, xche->i2 + xche->chg2 - xch->i2,
             ecb->priv) < 0)
      return -1;
  }
  return 0;
}

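/*
 * Mark a change group as ignorable when every line it touches on both sides
 * is blank according to xdl_blankline() (XDF_IGNORE_BLANK_LINES).
 */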
static void xdl_mark_ignorable_lines(xdchange_t *xscr, xdfenv_t *xe, long flags)
{
  xdchange_t *xch;

  for (xch = xscr; xch; xch = xch->next) {
    int ignore = 1;
    xrecord_t *rec;
    long i;

    rec = &xe->xdf1.recs[xch->i1];
    for (i = 0; i < xch->chg1 && ignore; i++)
      ignore = xdl_blankline((const char *)rec[i].ptr, (long)rec[i].size, flags);

    rec = &xe->xdf2.recs[xch->i2];
    for (i = 0; i < xch->chg2 && ignore; i++)
      ignore = xdl_blankline((const char *)rec[i].ptr, (long)rec[i].size, flags);

    xch->ignore = ignore;
  }
}

static int record_matches_regex(xrecord_t *rec, xpparam_t const *xpp) {
  regmatch_t regmatch;
  size_t i;

  for (i = 0; i < xpp->ignore_regex_nr; i++)
    if (!regexec_buf(xpp->ignore_regex[i], (const char *)rec->ptr, rec->size, 1,
         &regmatch, 0))
      return 1;

  return 0;
}

static void xdl_mark_ignorable_regex(xdchange_t *xscr, const xdfenv_t *xe,
             xpparam_t const *xpp)
{
  xdchange_t *xch;

  for (xch = xscr; xch; xch = xch->next) {
    xrecord_t *rec;
    int ignore = 1;
    long i;

    /*
     * Do not override --ignore-blank-lines.
     */
    if (xch->ignore)
      continue;

    rec = &xe->xdf1.recs[xch->i1];
    for (i = 0; i < xch->chg1 && ignore; i++)
      ignore = record_matches_regex(&rec[i], xpp);

    rec = &xe->xdf2.recs[xch->i2];
    for (i = 0; i < xch->chg2 && ignore; i++)
      ignore = record_matches_regex(&rec[i], xpp);

    xch->ignore = ignore;
  }
}

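/*
 * Main entry point: compute the raw diff, compact the change groups on both
 * sides, build the edit script, and emit it either as a textual diff or
 * through the hunk callback.
 */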
int xdl_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
       xdemitconf_t const *xecfg, xdemitcb_t *ecb) {
  xdchange_t *xscr;
  xdfenv_t xe;
  emit_func_t ef = xecfg->hunk_func ? xdl_call_hunk_func : xdl_emit_diff;

  if (xdl_do_diff(mf1, mf2, xpp, &xe) < 0) {

    return -1;
  }
  if (xdl_change_compact(&xe.xdf1, &xe.xdf2, xpp->flags) < 0 ||
      xdl_change_compact(&xe.xdf2, &xe.xdf1, xpp->flags) < 0 ||
      xdl_build_script(&xe, &xscr) < 0) {

    xdl_free_env(&xe);
    return -1;
  }
  if (xscr) {
    if (xpp->flags & XDF_IGNORE_BLANK_LINES)
      xdl_mark_ignorable_lines(xscr, &xe, xpp->flags);

    if (xpp->ignore_regex)
      xdl_mark_ignorable_regex(xscr, &xe, xpp);

    if (ef(&xe, xscr, ecb, xecfg) < 0) {

      xdl_free_script(xscr);
      xdl_free_env(&xe);
      return -1;
    }
    xdl_free_script(xscr);
  }
  xdl_free_env(&xe);

  return 0;
}