Coverage Report

Created: 2024-09-08 06:23

/src/git/bloom.c
Line
Count
Source (jump to first uncovered line)
1
#include "git-compat-util.h"
2
#include "bloom.h"
3
#include "diff.h"
4
#include "diffcore.h"
5
#include "hashmap.h"
6
#include "commit-graph.h"
7
#include "commit.h"
8
#include "commit-slab.h"
9
#include "tree.h"
10
#include "tree-walk.h"
11
#include "config.h"
12
#include "repository.h"
13
14
define_commit_slab(bloom_filter_slab, struct bloom_filter);
15
16
static struct bloom_filter_slab bloom_filters;
17
18
struct pathmap_hash_entry {
19
    struct hashmap_entry entry;
20
    const char path[FLEX_ARRAY];
21
};
22
23
static uint32_t rotate_left(uint32_t value, int32_t count)
24
0
{
25
0
  uint32_t mask = 8 * sizeof(uint32_t) - 1;
26
0
  count &= mask;
27
0
  return ((value << count) | (value >> ((-count) & mask)));
28
0
}
29
30
static inline unsigned char get_bitmask(uint32_t pos)
31
0
{
32
0
  return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1));
33
0
}
34
35
static int check_bloom_offset(struct commit_graph *g, uint32_t pos,
36
            uint32_t offset)
37
0
{
38
  /*
39
   * Note that we allow offsets equal to the data size, which would set
40
   * our pointers at one past the end of the chunk memory. This is
41
   * necessary because the on-disk index points to the end of the
42
   * entries (so we can compute size by comparing adjacent ones). And
43
   * naturally the final entry's end is one-past-the-end of the chunk.
44
   */
45
0
  if (offset <= g->chunk_bloom_data_size - BLOOMDATA_CHUNK_HEADER_SIZE)
46
0
    return 0;
47
48
0
  warning("ignoring out-of-range offset (%"PRIuMAX") for changed-path"
49
0
    " filter at pos %"PRIuMAX" of %s (chunk size: %"PRIuMAX")",
50
0
    (uintmax_t)offset, (uintmax_t)pos,
51
0
    g->filename, (uintmax_t)g->chunk_bloom_data_size);
52
0
  return -1;
53
0
}
54
55
int load_bloom_filter_from_graph(struct commit_graph *g,
56
         struct bloom_filter *filter,
57
         uint32_t graph_pos)
58
0
{
59
0
  uint32_t lex_pos, start_index, end_index;
60
61
0
  while (graph_pos < g->num_commits_in_base)
62
0
    g = g->base_graph;
63
64
  /* The commit graph commit 'c' lives in doesn't carry Bloom filters. */
65
0
  if (!g->chunk_bloom_indexes)
66
0
    return 0;
67
68
0
  lex_pos = graph_pos - g->num_commits_in_base;
69
70
0
  end_index = get_be32(g->chunk_bloom_indexes + 4 * lex_pos);
71
72
0
  if (lex_pos > 0)
73
0
    start_index = get_be32(g->chunk_bloom_indexes + 4 * (lex_pos - 1));
74
0
  else
75
0
    start_index = 0;
76
77
0
  if (check_bloom_offset(g, lex_pos, end_index) < 0 ||
78
0
      check_bloom_offset(g, lex_pos - 1, start_index) < 0)
79
0
    return 0;
80
81
0
  if (end_index < start_index) {
82
0
    warning("ignoring decreasing changed-path index offsets"
83
0
      " (%"PRIuMAX" > %"PRIuMAX") for positions"
84
0
      " %"PRIuMAX" and %"PRIuMAX" of %s",
85
0
      (uintmax_t)start_index, (uintmax_t)end_index,
86
0
      (uintmax_t)(lex_pos-1), (uintmax_t)lex_pos,
87
0
      g->filename);
88
0
    return 0;
89
0
  }
90
91
0
  filter->len = end_index - start_index;
92
0
  filter->data = (unsigned char *)(g->chunk_bloom_data +
93
0
          sizeof(unsigned char) * start_index +
94
0
          BLOOMDATA_CHUNK_HEADER_SIZE);
95
0
  filter->version = g->bloom_filter_settings->hash_version;
96
0
  filter->to_free = NULL;
97
98
0
  return 1;
99
0
}
100
101
/*
102
 * Calculate the murmur3 32-bit hash value for the given data
103
 * using the given seed.
104
 * Produces a uniformly distributed hash value.
105
 * Not considered to be cryptographically secure.
106
 * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
107
 */
108
uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len)
109
0
{
110
0
  const uint32_t c1 = 0xcc9e2d51;
111
0
  const uint32_t c2 = 0x1b873593;
112
0
  const uint32_t r1 = 15;
113
0
  const uint32_t r2 = 13;
114
0
  const uint32_t m = 5;
115
0
  const uint32_t n = 0xe6546b64;
116
0
  int i;
117
0
  uint32_t k1 = 0;
118
0
  const char *tail;
119
120
0
  int len4 = len / sizeof(uint32_t);
121
122
0
  uint32_t k;
123
0
  for (i = 0; i < len4; i++) {
124
0
    uint32_t byte1 = (uint32_t)(unsigned char)data[4*i];
125
0
    uint32_t byte2 = ((uint32_t)(unsigned char)data[4*i + 1]) << 8;
126
0
    uint32_t byte3 = ((uint32_t)(unsigned char)data[4*i + 2]) << 16;
127
0
    uint32_t byte4 = ((uint32_t)(unsigned char)data[4*i + 3]) << 24;
128
0
    k = byte1 | byte2 | byte3 | byte4;
129
0
    k *= c1;
130
0
    k = rotate_left(k, r1);
131
0
    k *= c2;
132
133
0
    seed ^= k;
134
0
    seed = rotate_left(seed, r2) * m + n;
135
0
  }
136
137
0
  tail = (data + len4 * sizeof(uint32_t));
138
139
0
  switch (len & (sizeof(uint32_t) - 1)) {
140
0
  case 3:
141
0
    k1 ^= ((uint32_t)(unsigned char)tail[2]) << 16;
142
    /*-fallthrough*/
143
0
  case 2:
144
0
    k1 ^= ((uint32_t)(unsigned char)tail[1]) << 8;
145
    /*-fallthrough*/
146
0
  case 1:
147
0
    k1 ^= ((uint32_t)(unsigned char)tail[0]) << 0;
148
0
    k1 *= c1;
149
0
    k1 = rotate_left(k1, r1);
150
0
    k1 *= c2;
151
0
    seed ^= k1;
152
0
    break;
153
0
  }
154
155
0
  seed ^= (uint32_t)len;
156
0
  seed ^= (seed >> 16);
157
0
  seed *= 0x85ebca6b;
158
0
  seed ^= (seed >> 13);
159
0
  seed *= 0xc2b2ae35;
160
0
  seed ^= (seed >> 16);
161
162
0
  return seed;
163
0
}
164
165
static uint32_t murmur3_seeded_v1(uint32_t seed, const char *data, size_t len)
166
0
{
167
0
  const uint32_t c1 = 0xcc9e2d51;
168
0
  const uint32_t c2 = 0x1b873593;
169
0
  const uint32_t r1 = 15;
170
0
  const uint32_t r2 = 13;
171
0
  const uint32_t m = 5;
172
0
  const uint32_t n = 0xe6546b64;
173
0
  int i;
174
0
  uint32_t k1 = 0;
175
0
  const char *tail;
176
177
0
  int len4 = len / sizeof(uint32_t);
178
179
0
  uint32_t k;
180
0
  for (i = 0; i < len4; i++) {
181
0
    uint32_t byte1 = (uint32_t)data[4*i];
182
0
    uint32_t byte2 = ((uint32_t)data[4*i + 1]) << 8;
183
0
    uint32_t byte3 = ((uint32_t)data[4*i + 2]) << 16;
184
0
    uint32_t byte4 = ((uint32_t)data[4*i + 3]) << 24;
185
0
    k = byte1 | byte2 | byte3 | byte4;
186
0
    k *= c1;
187
0
    k = rotate_left(k, r1);
188
0
    k *= c2;
189
190
0
    seed ^= k;
191
0
    seed = rotate_left(seed, r2) * m + n;
192
0
  }
193
194
0
  tail = (data + len4 * sizeof(uint32_t));
195
196
0
  switch (len & (sizeof(uint32_t) - 1)) {
197
0
  case 3:
198
0
    k1 ^= ((uint32_t)tail[2]) << 16;
199
    /*-fallthrough*/
200
0
  case 2:
201
0
    k1 ^= ((uint32_t)tail[1]) << 8;
202
    /*-fallthrough*/
203
0
  case 1:
204
0
    k1 ^= ((uint32_t)tail[0]) << 0;
205
0
    k1 *= c1;
206
0
    k1 = rotate_left(k1, r1);
207
0
    k1 *= c2;
208
0
    seed ^= k1;
209
0
    break;
210
0
  }
211
212
0
  seed ^= (uint32_t)len;
213
0
  seed ^= (seed >> 16);
214
0
  seed *= 0x85ebca6b;
215
0
  seed ^= (seed >> 13);
216
0
  seed *= 0xc2b2ae35;
217
0
  seed ^= (seed >> 16);
218
219
0
  return seed;
220
0
}
221
222
void fill_bloom_key(const char *data,
223
        size_t len,
224
        struct bloom_key *key,
225
        const struct bloom_filter_settings *settings)
226
0
{
227
0
  int i;
228
0
  const uint32_t seed0 = 0x293ae76f;
229
0
  const uint32_t seed1 = 0x7e646e2c;
230
0
  uint32_t hash0, hash1;
231
0
  if (settings->hash_version == 2) {
232
0
    hash0 = murmur3_seeded_v2(seed0, data, len);
233
0
    hash1 = murmur3_seeded_v2(seed1, data, len);
234
0
  } else {
235
0
    hash0 = murmur3_seeded_v1(seed0, data, len);
236
0
    hash1 = murmur3_seeded_v1(seed1, data, len);
237
0
  }
238
239
0
  key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t));
240
0
  for (i = 0; i < settings->num_hashes; i++)
241
0
    key->hashes[i] = hash0 + i * hash1;
242
0
}
243
244
void clear_bloom_key(struct bloom_key *key)
245
0
{
246
0
  FREE_AND_NULL(key->hashes);
247
0
}
248
249
void add_key_to_filter(const struct bloom_key *key,
250
           struct bloom_filter *filter,
251
           const struct bloom_filter_settings *settings)
252
0
{
253
0
  int i;
254
0
  uint64_t mod = filter->len * BITS_PER_WORD;
255
256
0
  for (i = 0; i < settings->num_hashes; i++) {
257
0
    uint64_t hash_mod = key->hashes[i] % mod;
258
0
    uint64_t block_pos = hash_mod / BITS_PER_WORD;
259
260
0
    filter->data[block_pos] |= get_bitmask(hash_mod);
261
0
  }
262
0
}
263
264
void init_bloom_filters(void)
265
0
{
266
0
  init_bloom_filter_slab(&bloom_filters);
267
0
}
268
269
static void free_one_bloom_filter(struct bloom_filter *filter)
270
0
{
271
0
  if (!filter)
272
0
    return;
273
0
  free(filter->to_free);
274
0
}
275
276
void deinit_bloom_filters(void)
277
0
{
278
0
  deep_clear_bloom_filter_slab(&bloom_filters, free_one_bloom_filter);
279
0
}
280
281
static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED,
282
           const struct hashmap_entry *eptr,
283
           const struct hashmap_entry *entry_or_key,
284
           const void *keydata UNUSED)
285
0
{
286
0
  const struct pathmap_hash_entry *e1, *e2;
287
288
0
  e1 = container_of(eptr, const struct pathmap_hash_entry, entry);
289
0
  e2 = container_of(entry_or_key, const struct pathmap_hash_entry, entry);
290
291
0
  return strcmp(e1->path, e2->path);
292
0
}
293
294
static void init_truncated_large_filter(struct bloom_filter *filter,
295
          int version)
296
0
{
297
0
  filter->data = filter->to_free = xmalloc(1);
298
0
  filter->data[0] = 0xFF;
299
0
  filter->len = 1;
300
0
  filter->version = version;
301
0
}
302
303
0
#define VISITED   (1u<<21)
304
0
#define HIGH_BITS (1u<<22)
305
306
static int has_entries_with_high_bit(struct repository *r, struct tree *t)
307
0
{
308
0
  if (parse_tree(t))
309
0
    return 1;
310
311
0
  if (!(t->object.flags & VISITED)) {
312
0
    struct tree_desc desc;
313
0
    struct name_entry entry;
314
315
0
    init_tree_desc(&desc, &t->object.oid, t->buffer, t->size);
316
0
    while (tree_entry(&desc, &entry)) {
317
0
      size_t i;
318
0
      for (i = 0; i < entry.pathlen; i++) {
319
0
        if (entry.path[i] & 0x80) {
320
0
          t->object.flags |= HIGH_BITS;
321
0
          goto done;
322
0
        }
323
0
      }
324
325
0
      if (S_ISDIR(entry.mode)) {
326
0
        struct tree *sub = lookup_tree(r, &entry.oid);
327
0
        if (sub && has_entries_with_high_bit(r, sub)) {
328
0
          t->object.flags |= HIGH_BITS;
329
0
          goto done;
330
0
        }
331
0
      }
332
333
0
    }
334
335
0
done:
336
0
    t->object.flags |= VISITED;
337
0
  }
338
339
0
  return !!(t->object.flags & HIGH_BITS);
340
0
}
341
342
static int commit_tree_has_high_bit_paths(struct repository *r,
343
            struct commit *c)
344
0
{
345
0
  struct tree *t;
346
0
  if (repo_parse_commit(r, c))
347
0
    return 1;
348
0
  t = repo_get_commit_tree(r, c);
349
0
  if (!t)
350
0
    return 1;
351
0
  return has_entries_with_high_bit(r, t);
352
0
}
353
354
static struct bloom_filter *upgrade_filter(struct repository *r, struct commit *c,
355
             struct bloom_filter *filter,
356
             int hash_version)
357
0
{
358
0
  struct commit_list *p = c->parents;
359
0
  if (commit_tree_has_high_bit_paths(r, c))
360
0
    return NULL;
361
362
0
  if (p && commit_tree_has_high_bit_paths(r, p->item))
363
0
    return NULL;
364
365
0
  filter->version = hash_version;
366
367
0
  return filter;
368
0
}
369
370
struct bloom_filter *get_bloom_filter(struct repository *r, struct commit *c)
371
0
{
372
0
  struct bloom_filter *filter;
373
0
  int hash_version;
374
375
0
  filter = get_or_compute_bloom_filter(r, c, 0, NULL, NULL);
376
0
  if (!filter)
377
0
    return NULL;
378
379
0
  prepare_repo_settings(r);
380
0
  hash_version = r->settings.commit_graph_changed_paths_version;
381
382
0
  if (!(hash_version == -1 || hash_version == filter->version))
383
0
    return NULL; /* unusable filter */
384
0
  return filter;
385
0
}
386
387
struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
388
             struct commit *c,
389
             int compute_if_not_present,
390
             const struct bloom_filter_settings *settings,
391
             enum bloom_filter_computed *computed)
392
0
{
393
0
  struct bloom_filter *filter;
394
0
  int i;
395
0
  struct diff_options diffopt;
396
397
0
  if (computed)
398
0
    *computed = BLOOM_NOT_COMPUTED;
399
400
0
  if (!bloom_filters.slab_size)
401
0
    return NULL;
402
403
0
  filter = bloom_filter_slab_at(&bloom_filters, c);
404
405
0
  if (!filter->data) {
406
0
    uint32_t graph_pos;
407
0
    if (repo_find_commit_pos_in_graph(r, c, &graph_pos))
408
0
      load_bloom_filter_from_graph(r->objects->commit_graph,
409
0
                 filter, graph_pos);
410
0
  }
411
412
0
  if (filter->data && filter->len) {
413
0
    struct bloom_filter *upgrade;
414
0
    if (!settings || settings->hash_version == filter->version)
415
0
      return filter;
416
417
    /* version mismatch, see if we can upgrade */
418
0
    if (compute_if_not_present &&
419
0
        git_env_bool("GIT_TEST_UPGRADE_BLOOM_FILTERS", 1)) {
420
0
      upgrade = upgrade_filter(r, c, filter,
421
0
             settings->hash_version);
422
0
      if (upgrade) {
423
0
        if (computed)
424
0
          *computed |= BLOOM_UPGRADED;
425
0
        return upgrade;
426
0
      }
427
0
    }
428
0
  }
429
0
  if (!compute_if_not_present)
430
0
    return NULL;
431
432
0
  repo_diff_setup(r, &diffopt);
433
0
  diffopt.flags.recursive = 1;
434
0
  diffopt.detect_rename = 0;
435
0
  diffopt.max_changes = settings->max_changed_paths;
436
0
  diff_setup_done(&diffopt);
437
438
  /* ensure commit is parsed so we have parent information */
439
0
  repo_parse_commit(r, c);
440
441
0
  if (c->parents)
442
0
    diff_tree_oid(&c->parents->item->object.oid, &c->object.oid, "", &diffopt);
443
0
  else
444
0
    diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
445
0
  diffcore_std(&diffopt);
446
447
0
  if (diff_queued_diff.nr <= settings->max_changed_paths) {
448
0
    struct hashmap pathmap = HASHMAP_INIT(pathmap_cmp, NULL);
449
0
    struct pathmap_hash_entry *e;
450
0
    struct hashmap_iter iter;
451
452
0
    for (i = 0; i < diff_queued_diff.nr; i++) {
453
0
      const char *path = diff_queued_diff.queue[i]->two->path;
454
455
      /*
456
       * Add each leading directory of the changed file, i.e. for
457
       * 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so
458
       * the Bloom filter could be used to speed up commands like
459
       * 'git log dir/subdir', too.
460
       *
461
       * Note that directories are added without the trailing '/'.
462
       */
463
0
      do {
464
0
        char *last_slash = strrchr(path, '/');
465
466
0
        FLEX_ALLOC_STR(e, path, path);
467
0
        hashmap_entry_init(&e->entry, strhash(path));
468
469
0
        if (!hashmap_get(&pathmap, &e->entry, NULL))
470
0
          hashmap_add(&pathmap, &e->entry);
471
0
        else
472
0
          free(e);
473
474
0
        if (!last_slash)
475
0
          last_slash = (char*)path;
476
0
        *last_slash = '\0';
477
478
0
      } while (*path);
479
480
0
      diff_free_filepair(diff_queued_diff.queue[i]);
481
0
    }
482
483
0
    if (hashmap_get_size(&pathmap) > settings->max_changed_paths) {
484
0
      init_truncated_large_filter(filter,
485
0
                settings->hash_version);
486
0
      if (computed)
487
0
        *computed |= BLOOM_TRUNC_LARGE;
488
0
      goto cleanup;
489
0
    }
490
491
0
    filter->len = (hashmap_get_size(&pathmap) * settings->bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
492
0
    filter->version = settings->hash_version;
493
0
    if (!filter->len) {
494
0
      if (computed)
495
0
        *computed |= BLOOM_TRUNC_EMPTY;
496
0
      filter->len = 1;
497
0
    }
498
0
    CALLOC_ARRAY(filter->data, filter->len);
499
0
    filter->to_free = filter->data;
500
501
0
    hashmap_for_each_entry(&pathmap, &iter, e, entry) {
502
0
      struct bloom_key key;
503
0
      fill_bloom_key(e->path, strlen(e->path), &key, settings);
504
0
      add_key_to_filter(&key, filter, settings);
505
0
      clear_bloom_key(&key);
506
0
    }
507
508
0
  cleanup:
509
0
    hashmap_clear_and_free(&pathmap, struct pathmap_hash_entry, entry);
510
0
  } else {
511
0
    for (i = 0; i < diff_queued_diff.nr; i++)
512
0
      diff_free_filepair(diff_queued_diff.queue[i]);
513
0
    init_truncated_large_filter(filter, settings->hash_version);
514
515
0
    if (computed)
516
0
      *computed |= BLOOM_TRUNC_LARGE;
517
0
  }
518
519
0
  if (computed)
520
0
    *computed |= BLOOM_COMPUTED;
521
522
0
  free(diff_queued_diff.queue);
523
0
  DIFF_QUEUE_CLEAR(&diff_queued_diff);
524
525
0
  return filter;
526
0
}
527
528
int bloom_filter_contains(const struct bloom_filter *filter,
529
        const struct bloom_key *key,
530
        const struct bloom_filter_settings *settings)
531
0
{
532
0
  int i;
533
0
  uint64_t mod = filter->len * BITS_PER_WORD;
534
535
0
  if (!mod)
536
0
    return -1;
537
538
0
  for (i = 0; i < settings->num_hashes; i++) {
539
0
    uint64_t hash_mod = key->hashes[i] % mod;
540
0
    uint64_t block_pos = hash_mod / BITS_PER_WORD;
541
0
    if (!(filter->data[block_pos] & get_bitmask(hash_mod)))
542
0
      return 0;
543
0
  }
544
545
0
  return 1;
546
0
}