/src/git/xdiff/xprepare.c

Source
/*
 *  LibXDiff by Davide Libenzi ( File Differential Library )
 *  Copyright (C) 2003  Davide Libenzi
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, see
 *  <http://www.gnu.org/licenses/>.
 *
 *  Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include "xinclude.h"


/*
 * Factor used by xdl_clean_mmatch(): a multimatch line is discarded only
 * when the multimatch count of the surrounding run, multiplied by this
 * value, is still below the total length of that run.
 */
#define XDL_KPDIS_RUN 4
/* Upper bound applied to the match-count limit in xdl_cleanup_records(). */
#define XDL_MAX_EQLIMIT 1024
/* Maximum number of lines xdl_clean_mmatch() scans on each side of a line. */
#define XDL_SIMSCAN_WINDOW 100
/*
 * Sample sizes handed to xdl_guess_lines() by xdl_prepare_env():
 * NLINES2 for histogram diff, NLINES1 for every other algorithm.
 */
#define XDL_GUESS_NLINES1 256
#define XDL_GUESS_NLINES2 20

/*
 * Per-record verdicts stored in the temporary action arrays of
 * xdl_cleanup_records(). Those arrays are zero-allocated, so an entry
 * that is never written reads as DISCARD.
 */
#define DISCARD 0
#define KEEP 1
#define INVESTIGATE 2

/*
 * One equivalence class of records: all records that xdl_classify_record()
 * found to have the same hash and to compare equal under xdl_recmatch().
 */
typedef struct s_xdlclass {
  /* Hash shared by every record of this class. */
  uint64_t line_hash;
  /* Next class chained in the same bucket of the classifier's rchash. */
  struct s_xdlclass *next;
  /* Start and length of the record that created this class. */
  const uint8_t *ptr;
  size_t size;
  /* Position of this class in the classifier's rcrecs array. */
  long idx;
  /* Number of records classified here during pass 1 (len1) and otherwise (len2). */
  long len1, len2;
} xdlclass_t;

/*
 * State used to assign every record of both files to an xdlclass_t.
 */
typedef struct s_xdlclassifier {
  /* Number of hash bits, as returned by xdl_hashbits(). */
  unsigned int hbits;
  /* Number of buckets in rchash (1 << hbits). */
  long hsize;
  /* Bucket heads; each bucket is a list linked through xdlclass_t.next. */
  xdlclass_t **rchash;
  /* Allocator from which xdlclass_t objects are obtained. */
  chastore_t ncha;
  /* Classes indexed by xdlclass_t.idx. */
  xdlclass_t **rcrecs;
  /* Capacity of rcrecs. */
  long alloc;
  /* Number of classes created so far. */
  long count;
  /* Flags forwarded to xdl_recmatch() when comparing records. */
  long flags;
} xdlclassifier_t;




/*
 * Set up an empty classifier.
 *
 * 'size' drives the number of hash bits, the initial capacity of the
 * rcrecs array and the size hint given to the class allocator. 'flags'
 * is stored and later handed to xdl_recmatch().
 *
 * Returns 0 on success. On failure returns -1 after releasing whatever
 * this function had already allocated.
 */
static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags) {
  cf->flags = flags;
  cf->hbits = xdl_hashbits((unsigned int) size);
  /* NOTE(review): plain int shift; assumes hbits stays below the width of int -- confirm against xdl_hashbits(). */
  cf->hsize = 1 << cf->hbits;

  if (xdl_cha_init(&cf->ncha, sizeof(xdlclass_t), size / 4 + 1) < 0)
    return -1;

  if (!XDL_CALLOC_ARRAY(cf->rchash, cf->hsize))
    goto fail_store;

  cf->alloc = size;
  if (!XDL_ALLOC_ARRAY(cf->rcrecs, cf->alloc))
    goto fail_buckets;

  cf->count = 0;
  return 0;

fail_buckets:
  xdl_free(cf->rchash);
fail_store:
  xdl_cha_free(&cf->ncha);
  return -1;
}


/*
 * Release everything xdl_init_classifier() allocated for 'cf': the class
 * allocator, the bucket array and the by-index class array.
 */
static void xdl_free_classifier(xdlclassifier_t *cf) {
  xdl_cha_free(&cf->ncha);
  xdl_free(cf->rchash);
  xdl_free(cf->rcrecs);
}


/*
 * Assign 'rec' to an equivalence class, creating the class if needed.
 *
 * The bucket selected by 'line_hash' is searched for a class with the
 * same hash whose stored record compares equal under xdl_recmatch().
 * If none exists a new class is allocated, given the next free index
 * and pushed onto the front of the bucket.
 *
 * The class counter for the current pass is bumped (len1 when pass is 1,
 * len2 otherwise) and the class index is stored in
 * rec->minimal_perfect_hash.
 *
 * Returns 0 on success, -1 if an allocation fails.
 */
static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t *rec,
             uint64_t line_hash) {
  size_t bucket = XDL_HASHLONG(line_hash, cf->hbits);
  xdlclass_t *cls = cf->rchash[bucket];

  /* Walk the chain; the byte comparison only runs when the hashes agree. */
  while (cls) {
    if (cls->line_hash == line_hash &&
        xdl_recmatch((const char *)cls->ptr, (long)cls->size,
          (const char *)rec->ptr, (long)rec->size, cf->flags))
      break;
    cls = cls->next;
  }

  if (!cls) {
    cls = xdl_cha_alloc(&cf->ncha);
    if (!cls)
      return -1;

    /* The index is claimed before the array is grown to hold it. */
    cls->idx = cf->count++;
    if (XDL_ALLOC_GROW(cf->rcrecs, cf->count, cf->alloc))
      return -1;
    cf->rcrecs[cls->idx] = cls;

    cls->line_hash = line_hash;
    cls->ptr = rec->ptr;
    cls->size = rec->size;
    cls->len1 = 0;
    cls->len2 = 0;

    cls->next = cf->rchash[bucket];
    cf->rchash[bucket] = cls;
  }

  if (pass == 1)
    cls->len1++;
  else
    cls->len2++;

  rec->minimal_perfect_hash = (size_t)cls->idx;

  return 0;
}


/*
 * Release the arrays owned by 'xdf'.
 *
 * A fully prepared context stores 'changed' advanced by one element past
 * the start of its allocation, so the pointer is moved back by one before
 * it is freed. xdl_prepare_ctx() can also abort while 'changed' is still
 * NULL; in that case there is nothing to free, and stepping back from a
 * null pointer would hand an invalid address to xdl_free(), so the array
 * is skipped.
 */
static void xdl_free_ctx(xdfile_t *xdf)
{
  xdl_free(xdf->reference_index);
  if (xdf->changed)
    xdl_free(xdf->changed - 1);
  xdl_free(xdf->recs);
}


/*
 * Split the contents of 'mf' into records and classify each one.
 *
 * pass   forwarded to xdl_classify_record() to select which per-class
 *        counter is incremented.
 * narec  initial capacity of the record array; it is grown on demand.
 * xpp    supplies the flags used for hashing and for algorithm selection.
 * cf     classifier shared by both files.
 * xdf    context to fill in; its array pointers are reset first.
 *
 * On success the context holds the record array, a zeroed 'changed'
 * array of nrec + 2 elements whose pointer is advanced by one (so that
 * indices -1 through nrec are addressable), and, unless the patience or
 * histogram algorithm is selected, a 'reference_index' array of
 * nrec + 1 elements.
 *
 * Returns 0 on success. On failure the context is released through
 * xdl_free_ctx() and -1 is returned.
 */
static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_t const *xpp,
         xdlclassifier_t *cf, xdfile_t *xdf) {
  long bsize;
  uint64_t hav;
  uint8_t const *blk, *cur, *top, *prev;
  xrecord_t *crec;

  xdf->reference_index = NULL;
  xdf->changed = NULL;
  xdf->recs = NULL;

  if (!XDL_ALLOC_ARRAY(xdf->recs, narec))
    goto abort;

  xdf->nrec = 0;
  if ((cur = blk = xdl_mmfile_first(mf, &bsize))) {
    for (top = blk + bsize; cur < top; ) {
      prev = cur;
      /* The loop relies on xdl_hash_record() moving 'cur' past the record. */
      hav = xdl_hash_record(&cur, top, xpp->flags);
      if (XDL_ALLOC_GROW(xdf->recs, (long)xdf->nrec + 1, narec))
        goto abort;
      crec = &xdf->recs[xdf->nrec++];
      crec->ptr = prev;
      crec->size = cur - prev;
      if (xdl_classify_record(pass, cf, crec, hav) < 0)
        goto abort;
    }
  }

  if (!XDL_CALLOC_ARRAY(xdf->changed, xdf->nrec + 2))
    goto abort;
  /*
   * Advance the pointer as soon as the allocation exists. The release
   * path frees 'changed - 1', so leaving the pointer unadjusted while a
   * later step can still fail would make that path free an address one
   * element before the real allocation.
   */
  xdf->changed += 1;

  if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
      (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF)) {
    if (!XDL_ALLOC_ARRAY(xdf->reference_index, xdf->nrec + 1))
      goto abort;
  }

  xdf->nreff = 0;
  xdf->dstart = 0;
  xdf->dend = xdf->nrec - 1;

  return 0;

abort:
  xdl_free_ctx(xdf);
  return -1;
}


/*
 * Release both file contexts held by 'xe'.
 */
void xdl_free_env(xdfenv_t *xe) {
  xdl_free_ctx(&xe->xdf1);
  xdl_free_ctx(&xe->xdf2);
}


/*
 * Decide whether the multimatch line at index 'i' should be discarded.
 *
 * 'action' holds one DISCARD / KEEP / INVESTIGATE verdict per line and
 * [s, e] is the inclusive range that may be inspected. The caller only
 * invokes this for lines whose verdict is INVESTIGATE.
 *
 * The run of DISCARD and INVESTIGATE lines around 'i' is measured in both
 * directions, each scan ending at the first KEEP line or at the range
 * limit. The line is discarded (true) only if both sides contain at least
 * one DISCARD line and the multimatch count times XDL_KPDIS_RUN is below
 * the total run length.
 */
static bool xdl_clean_mmatch(uint8_t const *action, long i, long s, long e) {
  long pos;
  long nomatch_before = 0, nomatch_after = 0;
  /* Both multimatch counters are seeded with 1, so line i counts on each side. */
  long multi_before = 1, multi_after = 1;
  long nomatch_total, multi_total;

  /*
   * Cap the scan at XDL_SIMSCAN_WINDOW lines per side. Without the cap a
   * long stretch free of KEEP lines would make every call walk to the
   * ends of the range, which is very costly on big files.
   */
  if (i - s > XDL_SIMSCAN_WINDOW)
    s = i - XDL_SIMSCAN_WINDOW;
  if (e - i > XDL_SIMSCAN_WINDOW)
    e = i + XDL_SIMSCAN_WINDOW;

  /* Backward scan over the lines preceding 'i'. */
  for (pos = i - 1; pos >= s; pos--) {
    uint8_t verdict = action[pos];

    if (verdict == KEEP)
      break;
    if (verdict == DISCARD)
      nomatch_before++;
    else if (verdict == INVESTIGATE)
      multi_before++;
    else
      BUG("Illegal value for action[i - r]");
  }

  /*
   * A preceding run made only of multimatch lines is not enough: such
   * lines are dropped only when surrounded by DISCARD lines.
   */
  if (!nomatch_before)
    return false;

  /* Forward scan over the lines following 'i'. */
  for (pos = i + 1; pos <= e; pos++) {
    uint8_t verdict = action[pos];

    if (verdict == KEEP)
      break;
    if (verdict == DISCARD)
      nomatch_after++;
    else if (verdict == INVESTIGATE)
      multi_after++;
    else
      BUG("Illegal value for action[i + r]");
  }

  /* Same requirement for the run that follows. */
  if (!nomatch_after)
    return false;

  nomatch_total = nomatch_after + nomatch_before;
  multi_total = multi_after + multi_before;

  return multi_total * XDL_KPDIS_RUN < (multi_total + nomatch_total);
}


/*
 * Try to reduce the problem complexity, discard records that have no
 * matches on the other file. Also, lines that have multiple matches
 * might be potentially discarded if they appear in a run of discardable.
 */
static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) {
  long i, nm, mlim;
  xrecord_t *recs;
  xdlclass_t *rcrec;
  uint8_t *action1 = NULL, *action2 = NULL;
  bool need_min = !!(cf->flags & XDF_NEED_MINIMAL);
  int ret = 0;

  /*
   * Create temporary arrays that will help us decide if
   * changed[i] should remain false, or become true.
   */
  if (!XDL_CALLOC_ARRAY(action1, xdf1->nrec + 1)) {
    ret = -1;
    goto cleanup;
  }
  if (!XDL_CALLOC_ARRAY(action2, xdf2->nrec + 1)) {
    ret = -1;
    goto cleanup;
  }

  /*
   * Initialize temporary arrays with DISCARD, KEEP, or INVESTIGATE.
   */
  if ((mlim = xdl_bogosqrt((long)xdf1->nrec)) > XDL_MAX_EQLIMIT)
    mlim = XDL_MAX_EQLIMIT;
  for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) {
    rcrec = cf->rcrecs[recs->minimal_perfect_hash];
    nm = rcrec ? rcrec->len2 : 0;
    action1[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
  }

  if ((mlim = xdl_bogosqrt((long)xdf2->nrec)) > XDL_MAX_EQLIMIT)
    mlim = XDL_MAX_EQLIMIT;
  for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) {
    rcrec = cf->rcrecs[recs->minimal_perfect_hash];
    nm = rcrec ? rcrec->len1 : 0;
    action2[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
  }

  /*
   * Use temporary arrays to decide if changed[i] should remain
   * false, or become true.
   */
  xdf1->nreff = 0;
  for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart];
       i <= xdf1->dend; i++, recs++) {
    if (action1[i] == KEEP ||
        (action1[i] == INVESTIGATE && !xdl_clean_mmatch(action1, i, xdf1->dstart, xdf1->dend))) {
      xdf1->reference_index[xdf1->nreff++] = i;
      /* changed[i] remains false, i.e. keep */
    } else
      xdf1->changed[i] = true;
      /* i.e. discard */
  }

  xdf2->nreff = 0;
  for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart];
       i <= xdf2->dend; i++, recs++) {
    if (action2[i] == KEEP ||
        (action2[i] == INVESTIGATE && !xdl_clean_mmatch(action2, i, xdf2->dstart, xdf2->dend))) {
      xdf2->reference_index[xdf2->nreff++] = i;
      /* changed[i] remains false, i.e. keep */
    } else
      xdf2->changed[i] = true;
      /* i.e. discard */
  }

cleanup:
  xdl_free(action1);
  xdl_free(action2);

  return ret;
}


/*
 * Skip the records both files share at their beginning and at their end.
 *
 * Records are compared through minimal_perfect_hash. On return dstart is
 * the index of the first record after the common prefix and dend is the
 * index of the last record before the common suffix, for each file. The
 * suffix scan never reaches back into the prefix. Always returns 0.
 */
static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2) {
  long budget = (long)XDL_MIN(xdf1->nrec, xdf2->nrec);
  long last1 = (long)xdf1->nrec - 1;
  long last2 = (long)xdf2->nrec - 1;
  long head = 0, tail = 0;

  /* Common prefix. */
  while (head < budget &&
         xdf1->recs[head].minimal_perfect_hash ==
         xdf2->recs[head].minimal_perfect_hash)
    head++;

  xdf1->dstart = xdf2->dstart = head;

  /* Common suffix, limited to what the prefix left over. */
  budget -= head;
  while (tail < budget &&
         xdf1->recs[last1 - tail].minimal_perfect_hash ==
         xdf2->recs[last2 - tail].minimal_perfect_hash)
    tail++;

  xdf1->dend = last1 - tail;
  xdf2->dend = last2 - tail;

  return 0;
}


/*
 * Run the two reduction steps in order: trim the common ends, then mark
 * discardable records. The second step is skipped if the first reports
 * failure. Returns 0 on success, -1 otherwise.
 */
static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) {
  if (xdl_trim_ends(xdf1, xdf2) < 0)
    return -1;

  if (xdl_cleanup_records(cf, xdf1, xdf2) < 0)
    return -1;

  return 0;
}

/*
 * Build the diff environment 'xe' for the two files 'mf1' and 'mf2'.
 *
 * A classifier shared by both files is created, each file is split into
 * classified records, and, unless the patience or histogram algorithm
 * is selected, the contexts are reduced with xdl_optimize_ctxs(). The
 * classifier is released before returning in every case.
 *
 * Returns 0 on success. On failure returns -1 with every context that
 * had been prepared released again.
 */
int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
        xdfenv_t *xe) {
  bool histogram = XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF;
  bool optimize = XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF &&
      XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF;
  long enl1, enl2, sample;
  xdlclassifier_t cf;
  int ret = -1;

  memset(&cf, 0, sizeof(cf));

  /*
   * Histogram diff gets the smaller sample: a rougher line-count
   * estimate is acceptable there, and xdl_prepare_ctx() establishes
   * the real record count regardless of the estimate.
   */
  if (histogram)
    sample = XDL_GUESS_NLINES2;
  else
    sample = XDL_GUESS_NLINES1;

  enl1 = xdl_guess_lines(mf1, sample) + 1;
  enl2 = xdl_guess_lines(mf2, sample) + 1;

  if (xdl_init_classifier(&cf, enl1 + enl2 + 1, xpp->flags) < 0)
    return -1;

  if (xdl_prepare_ctx(1, mf1, enl1, xpp, &cf, &xe->xdf1) < 0)
    goto out_classifier;

  if (xdl_prepare_ctx(2, mf2, enl2, xpp, &cf, &xe->xdf2) < 0)
    goto out_ctx1;

  if (optimize && xdl_optimize_ctxs(&cf, &xe->xdf1, &xe->xdf2) < 0)
    goto out_ctx2;

  ret = 0;
  goto out_classifier;

out_ctx2:
  xdl_free_ctx(&xe->xdf2);
out_ctx1:
  xdl_free_ctx(&xe->xdf1);
out_classifier:
  xdl_free_classifier(&cf);

  return ret;
}

Coverage Report

Created: 2026-03-21 06:46

Line	Count	Source
1		/*
2		* LibXDiff by Davide Libenzi ( File Differential Library )
3		* Copyright (C) 2003 Davide Libenzi
4		*
5		* This library is free software; you can redistribute it and/or
6		* modify it under the terms of the GNU Lesser General Public
7		* License as published by the Free Software Foundation; either
8		* version 2.1 of the License, or (at your option) any later version.
9		*
10		* This library is distributed in the hope that it will be useful,
11		* but WITHOUT ANY WARRANTY; without even the implied warranty of
12		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13		* Lesser General Public License for more details.
14		*
15		* You should have received a copy of the GNU Lesser General Public
16		* License along with this library; if not, see
17		* <http://www.gnu.org/licenses/>.
18		*
19		* Davide Libenzi <davidel@xmailserver.org>
20		*
21		*/
22
23		#include "xinclude.h"
24
25
26	0	#define XDL_KPDIS_RUN 4
27	0	#define XDL_MAX_EQLIMIT 1024
28	0	#define XDL_SIMSCAN_WINDOW 100
29	0	#define XDL_GUESS_NLINES1 256
30	0	#define XDL_GUESS_NLINES2 20
31
32	0	#define DISCARD 0
33	0	#define KEEP 1
34	0	#define INVESTIGATE 2
35
36		typedef struct s_xdlclass {
37		uint64_t line_hash;
38		struct s_xdlclass *next;
39		const uint8_t *ptr;
40		size_t size;
41		long idx;
42		long len1, len2;
43		} xdlclass_t;
44
45		typedef struct s_xdlclassifier {
46		unsigned int hbits;
47		long hsize;
48		xdlclass_t **rchash;
49		chastore_t ncha;
50		xdlclass_t **rcrecs;
51		long alloc;
52		long count;
53		long flags;
54		} xdlclassifier_t;
55
56
57
58
59	0	static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags) {
60	0	cf->flags = flags;
61
62	0	cf->hbits = xdl_hashbits((unsigned int) size);
63	0	cf->hsize = 1 << cf->hbits;
64
65	0	if (xdl_cha_init(&cf->ncha, sizeof(xdlclass_t), size / 4 + 1) < 0) {
66
67	0	return -1;
68	0	}
69	0	if (!XDL_CALLOC_ARRAY(cf->rchash, cf->hsize)) {
70
71	0	xdl_cha_free(&cf->ncha);
72	0	return -1;
73	0	}
74
75	0	cf->alloc = size;
76	0	if (!XDL_ALLOC_ARRAY(cf->rcrecs, cf->alloc)) {
77
78	0	xdl_free(cf->rchash);
79	0	xdl_cha_free(&cf->ncha);
80	0	return -1;
81	0	}
82
83	0	cf->count = 0;
84
85	0	return 0;
86	0	}
87
88
89	0	static void xdl_free_classifier(xdlclassifier_t *cf) {
90
91	0	xdl_free(cf->rcrecs);
92	0	xdl_free(cf->rchash);
93	0	xdl_cha_free(&cf->ncha);
94	0	}
95
96
97		static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t *rec,
98	0	uint64_t line_hash) {
99	0	size_t hi;
100	0	xdlclass_t *rcrec;
101
102	0	hi = XDL_HASHLONG(line_hash, cf->hbits);
103	0	for (rcrec = cf->rchash[hi]; rcrec; rcrec = rcrec->next)
104	0	if (rcrec->line_hash == line_hash &&
105	0	xdl_recmatch((const char *)rcrec->ptr, (long)rcrec->size,
106	0	(const char *)rec->ptr, (long)rec->size, cf->flags))
107	0	break;
108
109	0	if (!rcrec) {
110	0	if (!(rcrec = xdl_cha_alloc(&cf->ncha))) {
111
112	0	return -1;
113	0	}
114	0	rcrec->idx = cf->count++;
115	0	if (XDL_ALLOC_GROW(cf->rcrecs, cf->count, cf->alloc))
116	0	return -1;
117	0	cf->rcrecs[rcrec->idx] = rcrec;
118	0	rcrec->line_hash = line_hash;
119	0	rcrec->ptr = rec->ptr;
120	0	rcrec->size = rec->size;
121	0	rcrec->len1 = rcrec->len2 = 0;
122	0	rcrec->next = cf->rchash[hi];
123	0	cf->rchash[hi] = rcrec;
124	0	}
125
126	0	(pass == 1) ? rcrec->len1++ : rcrec->len2++;
127
128	0	rec->minimal_perfect_hash = (size_t)rcrec->idx;
129
130	0	return 0;
131	0	}
132
133
134		static void xdl_free_ctx(xdfile_t *xdf)
135	0	{
136	0	xdl_free(xdf->reference_index);
137	0	xdl_free(xdf->changed - 1);
138	0	xdl_free(xdf->recs);
139	0	}
140
141
142		static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_t const *xpp,
143	0	xdlclassifier_t *cf, xdfile_t *xdf) {
144	0	long bsize;
145	0	uint64_t hav;
146	0	uint8_t const *blk, *cur, *top, *prev;
147	0	xrecord_t *crec;
148
149	0	xdf->reference_index = NULL;
150	0	xdf->changed = NULL;
151	0	xdf->recs = NULL;
152
153	0	if (!XDL_ALLOC_ARRAY(xdf->recs, narec))
154	0	goto abort;
155
156	0	xdf->nrec = 0;
157	0	if ((cur = blk = xdl_mmfile_first(mf, &bsize))) {
158	0	for (top = blk + bsize; cur < top; ) {
159	0	prev = cur;
160	0	hav = xdl_hash_record(&cur, top, xpp->flags);
161	0	if (XDL_ALLOC_GROW(xdf->recs, (long)xdf->nrec + 1, narec))
162	0	goto abort;
163	0	crec = &xdf->recs[xdf->nrec++];
164	0	crec->ptr = prev;
165	0	crec->size = cur - prev;
166	0	if (xdl_classify_record(pass, cf, crec, hav) < 0)
167	0	goto abort;
168	0	}
169	0	}
170
171	0	if (!XDL_CALLOC_ARRAY(xdf->changed, xdf->nrec + 2))
172	0	goto abort;
173
174	0	if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
175	0	(XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF)) {
176	0	if (!XDL_ALLOC_ARRAY(xdf->reference_index, xdf->nrec + 1))
177	0	goto abort;
178	0	}
179
180	0	xdf->changed += 1;
181	0	xdf->nreff = 0;
182	0	xdf->dstart = 0;
183	0	xdf->dend = xdf->nrec - 1;
184
185	0	return 0;
186
187	0	abort:
188	0	xdl_free_ctx(xdf);
189	0	return -1;
190	0	}
191
192
193	0	void xdl_free_env(xdfenv_t *xe) {
194
195	0	xdl_free_ctx(&xe->xdf2);
196	0	xdl_free_ctx(&xe->xdf1);
197	0	}
198
199
200	0	static bool xdl_clean_mmatch(uint8_t const *action, long i, long s, long e) {
201	0	long r, rdis0, rpdis0, rdis1, rpdis1;
202
203		/*
204		* Limits the window that is examined during the similar-lines
205		* scan. The loops below stops when action[i - r] == KEEP
206		* (line that has no match), but there are corner cases where
207		* the loop proceed all the way to the extremities by causing
208		* huge performance penalties in case of big files.
209		*/
210	0	if (i - s > XDL_SIMSCAN_WINDOW)
211	0	s = i - XDL_SIMSCAN_WINDOW;
212	0	if (e - i > XDL_SIMSCAN_WINDOW)
213	0	e = i + XDL_SIMSCAN_WINDOW;
214
215		/*
216		* Scans the lines before 'i' to find a run of lines that either
217		* have no match (action[j] == DISCARD) or have multiple matches
218		* (action[j] == INVESTIGATE). Note that we always call this
219		* function with action[i] == INVESTIGATE, so the current line
220		* (i) is already a multimatch line.
221		*/
222	0	for (r = 1, rdis0 = 0, rpdis0 = 1; (i - r) >= s; r++) {
223	0	if (action[i - r] == DISCARD)
224	0	rdis0++;
225	0	else if (action[i - r] == INVESTIGATE)
226	0	rpdis0++;
227	0	else if (action[i - r] == KEEP)
228	0	break;
229	0	else
230	0	BUG("Illegal value for action[i - r]");
231	0	}
232		/*
233		* If the run before the line 'i' found only multimatch lines,
234		* we return false and hence we don't make the current line (i)
235		* discarded. We want to discard multimatch lines only when
236		* they appear in the middle of runs with nomatch lines
237		* (action[j] == DISCARD).
238		*/
239	0	if (rdis0 == 0)
240	0	return 0;
241	0	for (r = 1, rdis1 = 0, rpdis1 = 1; (i + r) <= e; r++) {
242	0	if (action[i + r] == DISCARD)
243	0	rdis1++;
244	0	else if (action[i + r] == INVESTIGATE)
245	0	rpdis1++;
246	0	else if (action[i + r] == KEEP)
247	0	break;
248	0	else
249	0	BUG("Illegal value for action[i + r]");
250	0	}
251		/*
252		* If the run after the line 'i' found only multimatch lines,
253		* we return false and hence we don't make the current line (i)
254		* discarded.
255		*/
256	0	if (rdis1 == 0)
257	0	return false;
258	0	rdis1 += rdis0;
259	0	rpdis1 += rpdis0;
260
261	0	return rpdis1 * XDL_KPDIS_RUN < (rpdis1 + rdis1);
262	0	}
263
264
265		/*
266		* Try to reduce the problem complexity, discard records that have no
267		* matches on the other file. Also, lines that have multiple matches
268		* might be potentially discarded if they appear in a run of discardable.
269		*/
270	0	static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) {
271	0	long i, nm, mlim;
272	0	xrecord_t *recs;
273	0	xdlclass_t *rcrec;
274	0	uint8_t *action1 = NULL, *action2 = NULL;
275	0	bool need_min = !!(cf->flags & XDF_NEED_MINIMAL);
276	0	int ret = 0;
277
278		/*
279		* Create temporary arrays that will help us decide if
280		* changed[i] should remain false, or become true.
281		*/
282	0	if (!XDL_CALLOC_ARRAY(action1, xdf1->nrec + 1)) {
283	0	ret = -1;
284	0	goto cleanup;
285	0	}
286	0	if (!XDL_CALLOC_ARRAY(action2, xdf2->nrec + 1)) {
287	0	ret = -1;
288	0	goto cleanup;
289	0	}
290
291		/*
292		* Initialize temporary arrays with DISCARD, KEEP, or INVESTIGATE.
293		*/
294	0	if ((mlim = xdl_bogosqrt((long)xdf1->nrec)) > XDL_MAX_EQLIMIT)
295	0	mlim = XDL_MAX_EQLIMIT;
296	0	for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) {
297	0	rcrec = cf->rcrecs[recs->minimal_perfect_hash];
298	0	nm = rcrec ? rcrec->len2 : 0;
299	0	action1[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
300	0	}
301
302	0	if ((mlim = xdl_bogosqrt((long)xdf2->nrec)) > XDL_MAX_EQLIMIT)
303	0	mlim = XDL_MAX_EQLIMIT;
304	0	for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) {
305	0	rcrec = cf->rcrecs[recs->minimal_perfect_hash];
306	0	nm = rcrec ? rcrec->len1 : 0;
307	0	action2[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
308	0	}
309
310		/*
311		* Use temporary arrays to decide if changed[i] should remain
312		* false, or become true.
313		*/
314	0	xdf1->nreff = 0;
315	0	for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart];
316	0	i <= xdf1->dend; i++, recs++) {
317	0	if (action1[i] == KEEP ||
318	0	(action1[i] == INVESTIGATE && !xdl_clean_mmatch(action1, i, xdf1->dstart, xdf1->dend))) {
319	0	xdf1->reference_index[xdf1->nreff++] = i;
320		/* changed[i] remains false, i.e. keep */
321	0	} else
322	0	xdf1->changed[i] = true;
323		/* i.e. discard */
324	0	}
325
326	0	xdf2->nreff = 0;
327	0	for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart];
328	0	i <= xdf2->dend; i++, recs++) {
329	0	if (action2[i] == KEEP ||
330	0	(action2[i] == INVESTIGATE && !xdl_clean_mmatch(action2, i, xdf2->dstart, xdf2->dend))) {
331	0	xdf2->reference_index[xdf2->nreff++] = i;
332		/* changed[i] remains false, i.e. keep */
333	0	} else
334	0	xdf2->changed[i] = true;
335		/* i.e. discard */
336	0	}
337
338	0	cleanup:
339	0	xdl_free(action1);
340	0	xdl_free(action2);
341
342	0	return ret;
343	0	}
344
345
346		/*
347		* Early trim initial and terminal matching records.
348		*/
349	0	static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2) {
350	0	long i, lim;
351	0	xrecord_t *recs1, *recs2;
352
353	0	recs1 = xdf1->recs;
354	0	recs2 = xdf2->recs;
355	0	for (i = 0, lim = (long)XDL_MIN(xdf1->nrec, xdf2->nrec); i < lim;
356	0	i++, recs1++, recs2++)
357	0	if (recs1->minimal_perfect_hash != recs2->minimal_perfect_hash)
358	0	break;
359
360	0	xdf1->dstart = xdf2->dstart = i;
361
362	0	recs1 = xdf1->recs + xdf1->nrec - 1;
363	0	recs2 = xdf2->recs + xdf2->nrec - 1;
364	0	for (lim -= i, i = 0; i < lim; i++, recs1--, recs2--)
365	0	if (recs1->minimal_perfect_hash != recs2->minimal_perfect_hash)
366	0	break;
367
368	0	xdf1->dend = (long)xdf1->nrec - i - 1;
369	0	xdf2->dend = (long)xdf2->nrec - i - 1;
370
371	0	return 0;
372	0	}
373
374
375	0	static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) {
376
377	0	if (xdl_trim_ends(xdf1, xdf2) < 0 ||
378	0	xdl_cleanup_records(cf, xdf1, xdf2) < 0) {
379
380	0	return -1;
381	0	}
382
383	0	return 0;
384	0	}
385
386		int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
387	0	xdfenv_t *xe) {
388	0	long enl1, enl2, sample;
389	0	xdlclassifier_t cf;
390
391	0	memset(&cf, 0, sizeof(cf));
392
393		/*
394		* For histogram diff, we can afford a smaller sample size and
395		* thus a poorer estimate of the number of lines, as the hash
396		* table (rhash) won't be filled up/grown. The number of lines
397		* (nrecs) will be updated correctly anyway by
398		* xdl_prepare_ctx().
399		*/
400	0	sample = (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF
401	0	? XDL_GUESS_NLINES2 : XDL_GUESS_NLINES1);
402
403	0	enl1 = xdl_guess_lines(mf1, sample) + 1;
404	0	enl2 = xdl_guess_lines(mf2, sample) + 1;
405
406	0	if (xdl_init_classifier(&cf, enl1 + enl2 + 1, xpp->flags) < 0)
407	0	return -1;
408
409	0	if (xdl_prepare_ctx(1, mf1, enl1, xpp, &cf, &xe->xdf1) < 0) {
410
411	0	xdl_free_classifier(&cf);
412	0	return -1;
413	0	}
414	0	if (xdl_prepare_ctx(2, mf2, enl2, xpp, &cf, &xe->xdf2) < 0) {
415
416	0	xdl_free_ctx(&xe->xdf1);
417	0	xdl_free_classifier(&cf);
418	0	return -1;
419	0	}
420
421	0	if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
422	0	(XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF) &&
423	0	xdl_optimize_ctxs(&cf, &xe->xdf1, &xe->xdf2) < 0) {
424
425	0	xdl_free_ctx(&xe->xdf2);
426	0	xdl_free_ctx(&xe->xdf1);
427	0	xdl_free_classifier(&cf);
428	0	return -1;
429	0	}
430
431	0	xdl_free_classifier(&cf);
432
433	0	return 0;
434	0	}