Coverage Report

Created: 2024-09-08 06:23

/src/git/diffcore-delta.c
Line
Count
Source (jump to first uncovered line)
1
#include "git-compat-util.h"
2
#include "diffcore.h"
3
4
/*
5
 * Idea here is very simple.
6
 *
7
 * Almost all data we are interested in are text, but sometimes we have
8
 * to deal with binary data.  So we cut them into chunks delimited by
9
 * LF byte, or 64-byte sequence, whichever comes first, and hash them.
10
 *
11
 * For those chunks, if the source buffer has more instances of it
12
 * than the destination buffer, that means the difference are the
13
 * number of bytes not copied from source to destination.  If the
14
 * counts are the same, everything was copied from source to
15
 * destination.  If the destination has more, everything was copied,
16
 * and destination added more.
17
 *
18
 * We are doing an approximation so we do not really have to waste
19
 * memory by actually storing the sequence.  We just hash them into
20
 * somewhere around 2^16 hashbuckets and count the occurrences.
21
 */
22
23
/* Wild guess at the initial hash size */
24
0
#define INITIAL_HASH_SIZE 9
25
26
/* We leave more room in smaller hash but do not let it
27
 * grow to have unused hole too much.
28
 */
29
0
#define INITIAL_FREE(sz_log2) ((1<<(sz_log2))*(sz_log2-3)/(sz_log2))
30
31
/* A prime rather carefully chosen between 2^16..2^17, so that
32
 * HASHBASE < INITIAL_FREE(17).  We want to keep the maximum hashtable
33
 * size under the current 2<<17 maximum, which can hold this many
34
 * different values before overflowing to hashtable of size 2<<18.
35
 */
36
0
#define HASHBASE 107927
37
38
struct spanhash {
39
  unsigned int hashval;
40
  unsigned int cnt;
41
};
42
struct spanhash_top {
43
  int alloc_log2;
44
  int free;
45
  struct spanhash data[FLEX_ARRAY];
46
};
47
48
static struct spanhash_top *spanhash_rehash(struct spanhash_top *orig)
49
0
{
50
0
  struct spanhash_top *new_spanhash;
51
0
  int i;
52
0
  int osz = 1 << orig->alloc_log2;
53
0
  int sz = osz << 1;
54
55
0
  new_spanhash = xmalloc(st_add(sizeof(*orig),
56
0
           st_mult(sizeof(struct spanhash), sz)));
57
0
  new_spanhash->alloc_log2 = orig->alloc_log2 + 1;
58
0
  new_spanhash->free = INITIAL_FREE(new_spanhash->alloc_log2);
59
0
  memset(new_spanhash->data, 0, sizeof(struct spanhash) * sz);
60
0
  for (i = 0; i < osz; i++) {
61
0
    struct spanhash *o = &(orig->data[i]);
62
0
    int bucket;
63
0
    if (!o->cnt)
64
0
      continue;
65
0
    bucket = o->hashval & (sz - 1);
66
0
    while (1) {
67
0
      struct spanhash *h = &(new_spanhash->data[bucket++]);
68
0
      if (!h->cnt) {
69
0
        h->hashval = o->hashval;
70
0
        h->cnt = o->cnt;
71
0
        new_spanhash->free--;
72
0
        break;
73
0
      }
74
0
      if (sz <= bucket)
75
0
        bucket = 0;
76
0
    }
77
0
  }
78
0
  free(orig);
79
0
  return new_spanhash;
80
0
}
81
82
static struct spanhash_top *add_spanhash(struct spanhash_top *top,
83
           unsigned int hashval, int cnt)
84
0
{
85
0
  int bucket, lim;
86
0
  struct spanhash *h;
87
88
0
  lim = (1 << top->alloc_log2);
89
0
  bucket = hashval & (lim - 1);
90
0
  while (1) {
91
0
    h = &(top->data[bucket++]);
92
0
    if (!h->cnt) {
93
0
      h->hashval = hashval;
94
0
      h->cnt = cnt;
95
0
      top->free--;
96
0
      if (top->free < 0)
97
0
        return spanhash_rehash(top);
98
0
      return top;
99
0
    }
100
0
    if (h->hashval == hashval) {
101
0
      h->cnt += cnt;
102
0
      return top;
103
0
    }
104
0
    if (lim <= bucket)
105
0
      bucket = 0;
106
0
  }
107
0
}
108
109
static int spanhash_cmp(const void *a_, const void *b_)
110
0
{
111
0
  const struct spanhash *a = a_;
112
0
  const struct spanhash *b = b_;
113
114
  /* A count of zero compares at the end.. */
115
0
  if (!a->cnt)
116
0
    return !b->cnt ? 0 : 1;
117
0
  if (!b->cnt)
118
0
    return -1;
119
0
  return a->hashval < b->hashval ? -1 :
120
0
    a->hashval > b->hashval ? 1 : 0;
121
0
}
122
123
static struct spanhash_top *hash_chars(struct repository *r,
124
               struct diff_filespec *one)
125
0
{
126
0
  int i, n;
127
0
  unsigned int accum1, accum2, hashval;
128
0
  struct spanhash_top *hash;
129
0
  unsigned char *buf = one->data;
130
0
  unsigned int sz = one->size;
131
0
  int is_text = !diff_filespec_is_binary(r, one);
132
133
0
  i = INITIAL_HASH_SIZE;
134
0
  hash = xmalloc(st_add(sizeof(*hash),
135
0
            st_mult(sizeof(struct spanhash), (size_t)1 << i)));
136
0
  hash->alloc_log2 = i;
137
0
  hash->free = INITIAL_FREE(i);
138
0
  memset(hash->data, 0, sizeof(struct spanhash) * ((size_t)1 << i));
139
140
0
  n = 0;
141
0
  accum1 = accum2 = 0;
142
0
  while (sz) {
143
0
    unsigned int c = *buf++;
144
0
    unsigned int old_1 = accum1;
145
0
    sz--;
146
147
    /* Ignore CR in CRLF sequence if text */
148
0
    if (is_text && c == '\r' && sz && *buf == '\n')
149
0
      continue;
150
151
0
    accum1 = (accum1 << 7) ^ (accum2 >> 25);
152
0
    accum2 = (accum2 << 7) ^ (old_1 >> 25);
153
0
    accum1 += c;
154
0
    if (++n < 64 && c != '\n')
155
0
      continue;
156
0
    hashval = (accum1 + accum2 * 0x61) % HASHBASE;
157
0
    hash = add_spanhash(hash, hashval, n);
158
0
    n = 0;
159
0
    accum1 = accum2 = 0;
160
0
  }
161
0
  if (n > 0) {
162
0
    hashval = (accum1 + accum2 * 0x61) % HASHBASE;
163
0
    hash = add_spanhash(hash, hashval, n);
164
0
  }
165
0
  QSORT(hash->data, (size_t)1ul << hash->alloc_log2, spanhash_cmp);
166
0
  return hash;
167
0
}
168
169
int diffcore_count_changes(struct repository *r,
170
         struct diff_filespec *src,
171
         struct diff_filespec *dst,
172
         void **src_count_p,
173
         void **dst_count_p,
174
         unsigned long *src_copied,
175
         unsigned long *literal_added)
176
0
{
177
0
  struct spanhash *s, *d;
178
0
  struct spanhash_top *src_count, *dst_count;
179
0
  unsigned long sc, la;
180
181
0
  src_count = dst_count = NULL;
182
0
  if (src_count_p)
183
0
    src_count = *src_count_p;
184
0
  if (!src_count) {
185
0
    src_count = hash_chars(r, src);
186
0
    if (src_count_p)
187
0
      *src_count_p = src_count;
188
0
  }
189
0
  if (dst_count_p)
190
0
    dst_count = *dst_count_p;
191
0
  if (!dst_count) {
192
0
    dst_count = hash_chars(r, dst);
193
0
    if (dst_count_p)
194
0
      *dst_count_p = dst_count;
195
0
  }
196
0
  sc = la = 0;
197
198
0
  s = src_count->data;
199
0
  d = dst_count->data;
200
0
  for (;;) {
201
0
    unsigned dst_cnt, src_cnt;
202
0
    if (!s->cnt)
203
0
      break; /* we checked all in src */
204
0
    while (d->cnt) {
205
0
      if (d->hashval >= s->hashval)
206
0
        break;
207
0
      la += d->cnt;
208
0
      d++;
209
0
    }
210
0
    src_cnt = s->cnt;
211
0
    dst_cnt = 0;
212
0
    if (d->cnt && d->hashval == s->hashval) {
213
0
      dst_cnt = d->cnt;
214
0
      d++;
215
0
    }
216
0
    if (src_cnt < dst_cnt) {
217
0
      la += dst_cnt - src_cnt;
218
0
      sc += src_cnt;
219
0
    }
220
0
    else
221
0
      sc += dst_cnt;
222
0
    s++;
223
0
  }
224
0
  while (d->cnt) {
225
0
    la += d->cnt;
226
0
    d++;
227
0
  }
228
229
0
  if (!src_count_p)
230
0
    free(src_count);
231
0
  if (!dst_count_p)
232
0
    free(dst_count);
233
0
  *src_copied = sc;
234
0
  *literal_added = la;
235
0
  return 0;
236
0
}