Coverage Report

Created: 2026-01-09 07:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/git/bloom.c
Line
Count
Source
1
#define DISABLE_SIGN_COMPARE_WARNINGS
2
3
#include "git-compat-util.h"
4
#include "bloom.h"
5
#include "diff.h"
6
#include "diffcore.h"
7
#include "hashmap.h"
8
#include "commit-graph.h"
9
#include "commit.h"
10
#include "commit-slab.h"
11
#include "tree.h"
12
#include "tree-walk.h"
13
#include "config.h"
14
#include "repository.h"
15
16
0
define_commit_slab(bloom_filter_slab, struct bloom_filter);
Unexecuted instantiation: bloom.c:init_bloom_filter_slab_with_stride
Unexecuted instantiation: bloom.c:clear_bloom_filter_slab
Unexecuted instantiation: bloom.c:bloom_filter_slab_at_peek
17
0
18
0
static struct bloom_filter_slab bloom_filters;
19
0
20
0
struct pathmap_hash_entry {
21
0
    struct hashmap_entry entry;
22
0
    const char path[FLEX_ARRAY];
23
0
};
24
0
25
0
static uint32_t rotate_left(uint32_t value, int32_t count)
26
0
{
27
0
  uint32_t mask = 8 * sizeof(uint32_t) - 1;
28
0
  count &= mask;
29
0
  return ((value << count) | (value >> ((-count) & mask)));
30
0
}
31
32
static inline unsigned char get_bitmask(uint32_t pos)
33
0
{
34
0
  return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1));
35
0
}
36
37
static int check_bloom_offset(struct commit_graph *g, uint32_t pos,
38
            uint32_t offset)
39
0
{
40
  /*
41
   * Note that we allow offsets equal to the data size, which would set
42
   * our pointers at one past the end of the chunk memory. This is
43
   * necessary because the on-disk index points to the end of the
44
   * entries (so we can compute size by comparing adjacent ones). And
45
   * naturally the final entry's end is one-past-the-end of the chunk.
46
   */
47
0
  if (offset <= g->chunk_bloom_data_size - BLOOMDATA_CHUNK_HEADER_SIZE)
48
0
    return 0;
49
50
0
  warning("ignoring out-of-range offset (%"PRIuMAX") for changed-path"
51
0
    " filter at pos %"PRIuMAX" of %s (chunk size: %"PRIuMAX")",
52
0
    (uintmax_t)offset, (uintmax_t)pos,
53
0
    g->filename, (uintmax_t)g->chunk_bloom_data_size);
54
0
  return -1;
55
0
}
56
57
int load_bloom_filter_from_graph(struct commit_graph *g,
58
         struct bloom_filter *filter,
59
         uint32_t graph_pos)
60
0
{
61
0
  uint32_t lex_pos, start_index, end_index;
62
63
0
  while (graph_pos < g->num_commits_in_base)
64
0
    g = g->base_graph;
65
66
  /* The commit graph commit 'c' lives in doesn't carry Bloom filters. */
67
0
  if (!g->chunk_bloom_indexes)
68
0
    return 0;
69
70
0
  lex_pos = graph_pos - g->num_commits_in_base;
71
72
0
  end_index = get_be32(g->chunk_bloom_indexes + 4 * lex_pos);
73
74
0
  if (lex_pos > 0)
75
0
    start_index = get_be32(g->chunk_bloom_indexes + 4 * (lex_pos - 1));
76
0
  else
77
0
    start_index = 0;
78
79
0
  if (check_bloom_offset(g, lex_pos, end_index) < 0 ||
80
0
      check_bloom_offset(g, lex_pos - 1, start_index) < 0)
81
0
    return 0;
82
83
0
  if (end_index < start_index) {
84
0
    warning("ignoring decreasing changed-path index offsets"
85
0
      " (%"PRIuMAX" > %"PRIuMAX") for positions"
86
0
      " %"PRIuMAX" and %"PRIuMAX" of %s",
87
0
      (uintmax_t)start_index, (uintmax_t)end_index,
88
0
      (uintmax_t)(lex_pos-1), (uintmax_t)lex_pos,
89
0
      g->filename);
90
0
    return 0;
91
0
  }
92
93
0
  filter->len = end_index - start_index;
94
0
  filter->data = (unsigned char *)(g->chunk_bloom_data +
95
0
          sizeof(unsigned char) * start_index +
96
0
          BLOOMDATA_CHUNK_HEADER_SIZE);
97
0
  filter->version = g->bloom_filter_settings->hash_version;
98
0
  filter->to_free = NULL;
99
100
0
  return 1;
101
0
}
102
103
/*
104
 * Calculate the murmur3 32-bit hash value for the given data
105
 * using the given seed.
106
 * Produces a uniformly distributed hash value.
107
 * Not considered to be cryptographically secure.
108
 * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
109
 */
110
static uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len)
111
0
{
112
0
  const uint32_t c1 = 0xcc9e2d51;
113
0
  const uint32_t c2 = 0x1b873593;
114
0
  const uint32_t r1 = 15;
115
0
  const uint32_t r2 = 13;
116
0
  const uint32_t m = 5;
117
0
  const uint32_t n = 0xe6546b64;
118
0
  int i;
119
0
  uint32_t k1 = 0;
120
0
  const char *tail;
121
122
0
  int len4 = len / sizeof(uint32_t);
123
124
0
  uint32_t k;
125
0
  for (i = 0; i < len4; i++) {
126
0
    uint32_t byte1 = (uint32_t)(unsigned char)data[4*i];
127
0
    uint32_t byte2 = ((uint32_t)(unsigned char)data[4*i + 1]) << 8;
128
0
    uint32_t byte3 = ((uint32_t)(unsigned char)data[4*i + 2]) << 16;
129
0
    uint32_t byte4 = ((uint32_t)(unsigned char)data[4*i + 3]) << 24;
130
0
    k = byte1 | byte2 | byte3 | byte4;
131
0
    k *= c1;
132
0
    k = rotate_left(k, r1);
133
0
    k *= c2;
134
135
0
    seed ^= k;
136
0
    seed = rotate_left(seed, r2) * m + n;
137
0
  }
138
139
0
  tail = (data + len4 * sizeof(uint32_t));
140
141
0
  switch (len & (sizeof(uint32_t) - 1)) {
142
0
  case 3:
143
0
    k1 ^= ((uint32_t)(unsigned char)tail[2]) << 16;
144
    /*-fallthrough*/
145
0
  case 2:
146
0
    k1 ^= ((uint32_t)(unsigned char)tail[1]) << 8;
147
    /*-fallthrough*/
148
0
  case 1:
149
0
    k1 ^= ((uint32_t)(unsigned char)tail[0]) << 0;
150
0
    k1 *= c1;
151
0
    k1 = rotate_left(k1, r1);
152
0
    k1 *= c2;
153
0
    seed ^= k1;
154
0
    break;
155
0
  }
156
157
0
  seed ^= (uint32_t)len;
158
0
  seed ^= (seed >> 16);
159
0
  seed *= 0x85ebca6b;
160
0
  seed ^= (seed >> 13);
161
0
  seed *= 0xc2b2ae35;
162
0
  seed ^= (seed >> 16);
163
164
0
  return seed;
165
0
}
166
167
static uint32_t murmur3_seeded_v1(uint32_t seed, const char *data, size_t len)
168
0
{
169
0
  const uint32_t c1 = 0xcc9e2d51;
170
0
  const uint32_t c2 = 0x1b873593;
171
0
  const uint32_t r1 = 15;
172
0
  const uint32_t r2 = 13;
173
0
  const uint32_t m = 5;
174
0
  const uint32_t n = 0xe6546b64;
175
0
  int i;
176
0
  uint32_t k1 = 0;
177
0
  const char *tail;
178
179
0
  int len4 = len / sizeof(uint32_t);
180
181
0
  uint32_t k;
182
0
  for (i = 0; i < len4; i++) {
183
0
    uint32_t byte1 = (uint32_t)data[4*i];
184
0
    uint32_t byte2 = ((uint32_t)data[4*i + 1]) << 8;
185
0
    uint32_t byte3 = ((uint32_t)data[4*i + 2]) << 16;
186
0
    uint32_t byte4 = ((uint32_t)data[4*i + 3]) << 24;
187
0
    k = byte1 | byte2 | byte3 | byte4;
188
0
    k *= c1;
189
0
    k = rotate_left(k, r1);
190
0
    k *= c2;
191
192
0
    seed ^= k;
193
0
    seed = rotate_left(seed, r2) * m + n;
194
0
  }
195
196
0
  tail = (data + len4 * sizeof(uint32_t));
197
198
0
  switch (len & (sizeof(uint32_t) - 1)) {
199
0
  case 3:
200
0
    k1 ^= ((uint32_t)tail[2]) << 16;
201
    /*-fallthrough*/
202
0
  case 2:
203
0
    k1 ^= ((uint32_t)tail[1]) << 8;
204
    /*-fallthrough*/
205
0
  case 1:
206
0
    k1 ^= ((uint32_t)tail[0]) << 0;
207
0
    k1 *= c1;
208
0
    k1 = rotate_left(k1, r1);
209
0
    k1 *= c2;
210
0
    seed ^= k1;
211
0
    break;
212
0
  }
213
214
0
  seed ^= (uint32_t)len;
215
0
  seed ^= (seed >> 16);
216
0
  seed *= 0x85ebca6b;
217
0
  seed ^= (seed >> 13);
218
0
  seed *= 0xc2b2ae35;
219
0
  seed ^= (seed >> 16);
220
221
0
  return seed;
222
0
}
223
224
void bloom_key_fill(struct bloom_key *key, const char *data, size_t len,
225
        const struct bloom_filter_settings *settings)
226
0
{
227
0
  int i;
228
0
  const uint32_t seed0 = 0x293ae76f;
229
0
  const uint32_t seed1 = 0x7e646e2c;
230
0
  uint32_t hash0, hash1;
231
0
  if (settings->hash_version == 2) {
232
0
    hash0 = murmur3_seeded_v2(seed0, data, len);
233
0
    hash1 = murmur3_seeded_v2(seed1, data, len);
234
0
  } else {
235
0
    hash0 = murmur3_seeded_v1(seed0, data, len);
236
0
    hash1 = murmur3_seeded_v1(seed1, data, len);
237
0
  }
238
239
0
  key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t));
240
0
  for (i = 0; i < settings->num_hashes; i++)
241
0
    key->hashes[i] = hash0 + i * hash1;
242
0
}
243
244
void bloom_key_clear(struct bloom_key *key)
245
0
{
246
0
  FREE_AND_NULL(key->hashes);
247
0
}
248
249
void add_key_to_filter(const struct bloom_key *key,
250
           struct bloom_filter *filter,
251
           const struct bloom_filter_settings *settings)
252
0
{
253
0
  int i;
254
0
  uint64_t mod = filter->len * BITS_PER_WORD;
255
256
0
  for (i = 0; i < settings->num_hashes; i++) {
257
0
    uint64_t hash_mod = key->hashes[i] % mod;
258
0
    uint64_t block_pos = hash_mod / BITS_PER_WORD;
259
260
0
    filter->data[block_pos] |= get_bitmask(hash_mod);
261
0
  }
262
0
}
263
264
void init_bloom_filters(void)
265
0
{
266
0
  init_bloom_filter_slab(&bloom_filters);
267
0
}
268
269
static void free_one_bloom_filter(struct bloom_filter *filter)
270
0
{
271
0
  if (!filter)
272
0
    return;
273
0
  free(filter->to_free);
274
0
}
275
276
void deinit_bloom_filters(void)
277
0
{
278
0
  deep_clear_bloom_filter_slab(&bloom_filters, free_one_bloom_filter);
279
0
}
280
281
struct bloom_keyvec *bloom_keyvec_new(const char *path, size_t len,
282
              const struct bloom_filter_settings *settings)
283
0
{
284
0
  struct bloom_keyvec *vec;
285
0
  const char *p;
286
0
  size_t sz;
287
0
  size_t nr = 1;
288
289
0
  p = path;
290
0
  while (*p) {
291
    /*
292
     * At this point, the path is normalized to use Unix-style
293
     * path separators. This is required due to how the
294
     * changed-path Bloom filters store the paths.
295
     */
296
0
    if (*p == '/')
297
0
      nr++;
298
0
    p++;
299
0
  }
300
301
0
  sz = sizeof(struct bloom_keyvec);
302
0
  sz += nr * sizeof(struct bloom_key);
303
0
  vec = (struct bloom_keyvec *)xcalloc(1, sz);
304
0
  if (!vec)
305
0
    return NULL;
306
0
  vec->count = nr;
307
308
0
  bloom_key_fill(&vec->key[0], path, len, settings);
309
0
  nr = 1;
310
0
  p = path + len - 1;
311
0
  while (p > path) {
312
0
    if (*p == '/') {
313
0
      bloom_key_fill(&vec->key[nr++], path, p - path, settings);
314
0
    }
315
0
    p--;
316
0
  }
317
0
  assert(nr == vec->count);
318
0
  return vec;
319
0
}
320
321
void bloom_keyvec_free(struct bloom_keyvec *vec)
322
0
{
323
0
  if (!vec)
324
0
    return;
325
0
  for (size_t nr = 0; nr < vec->count; nr++)
326
0
    bloom_key_clear(&vec->key[nr]);
327
0
  free(vec);
328
0
}
329
330
static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED,
331
           const struct hashmap_entry *eptr,
332
           const struct hashmap_entry *entry_or_key,
333
           const void *keydata UNUSED)
334
0
{
335
0
  const struct pathmap_hash_entry *e1, *e2;
336
337
0
  e1 = container_of(eptr, const struct pathmap_hash_entry, entry);
338
0
  e2 = container_of(entry_or_key, const struct pathmap_hash_entry, entry);
339
340
0
  return strcmp(e1->path, e2->path);
341
0
}
342
343
static void init_truncated_large_filter(struct bloom_filter *filter,
344
          int version)
345
0
{
346
0
  filter->data = filter->to_free = xmalloc(1);
347
0
  filter->data[0] = 0xFF;
348
0
  filter->len = 1;
349
0
  filter->version = version;
350
0
}
351
352
0
#define VISITED   (1u<<21)
353
0
#define HIGH_BITS (1u<<22)
354
355
static int has_entries_with_high_bit(struct repository *r, struct tree *t)
356
0
{
357
0
  if (parse_tree(t))
358
0
    return 1;
359
360
0
  if (!(t->object.flags & VISITED)) {
361
0
    struct tree_desc desc;
362
0
    struct name_entry entry;
363
364
0
    init_tree_desc(&desc, &t->object.oid, t->buffer, t->size);
365
0
    while (tree_entry(&desc, &entry)) {
366
0
      size_t i;
367
0
      for (i = 0; i < entry.pathlen; i++) {
368
0
        if (entry.path[i] & 0x80) {
369
0
          t->object.flags |= HIGH_BITS;
370
0
          goto done;
371
0
        }
372
0
      }
373
374
0
      if (S_ISDIR(entry.mode)) {
375
0
        struct tree *sub = lookup_tree(r, &entry.oid);
376
0
        if (sub && has_entries_with_high_bit(r, sub)) {
377
0
          t->object.flags |= HIGH_BITS;
378
0
          goto done;
379
0
        }
380
0
      }
381
382
0
    }
383
384
0
done:
385
0
    t->object.flags |= VISITED;
386
0
  }
387
388
0
  return !!(t->object.flags & HIGH_BITS);
389
0
}
390
391
static int commit_tree_has_high_bit_paths(struct repository *r,
392
            struct commit *c)
393
0
{
394
0
  struct tree *t;
395
0
  if (repo_parse_commit(r, c))
396
0
    return 1;
397
0
  t = repo_get_commit_tree(r, c);
398
0
  if (!t)
399
0
    return 1;
400
0
  return has_entries_with_high_bit(r, t);
401
0
}
402
403
static struct bloom_filter *upgrade_filter(struct repository *r, struct commit *c,
404
             struct bloom_filter *filter,
405
             int hash_version)
406
0
{
407
0
  struct commit_list *p = c->parents;
408
0
  if (commit_tree_has_high_bit_paths(r, c))
409
0
    return NULL;
410
411
0
  if (p && commit_tree_has_high_bit_paths(r, p->item))
412
0
    return NULL;
413
414
0
  filter->version = hash_version;
415
416
0
  return filter;
417
0
}
418
419
struct bloom_filter *get_bloom_filter(struct repository *r, struct commit *c)
420
0
{
421
0
  struct bloom_filter *filter;
422
0
  int hash_version;
423
424
0
  filter = get_or_compute_bloom_filter(r, c, 0, NULL, NULL);
425
0
  if (!filter)
426
0
    return NULL;
427
428
0
  prepare_repo_settings(r);
429
0
  hash_version = r->settings.commit_graph_changed_paths_version;
430
431
0
  if (!(hash_version == -1 || hash_version == filter->version))
432
0
    return NULL; /* unusable filter */
433
0
  return filter;
434
0
}
435
436
struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
437
             struct commit *c,
438
             int compute_if_not_present,
439
             const struct bloom_filter_settings *settings,
440
             enum bloom_filter_computed *computed)
441
0
{
442
0
  struct bloom_filter *filter;
443
0
  int i;
444
0
  struct diff_options diffopt;
445
446
0
  if (computed)
447
0
    *computed = BLOOM_NOT_COMPUTED;
448
449
0
  if (!bloom_filters.slab_size)
450
0
    return NULL;
451
452
0
  filter = bloom_filter_slab_at(&bloom_filters, c);
453
454
0
  if (!filter->data) {
455
0
    struct commit_graph *g;
456
0
    uint32_t graph_pos;
457
458
0
    g = repo_find_commit_pos_in_graph(r, c, &graph_pos);
459
0
    if (g)
460
0
      load_bloom_filter_from_graph(g, filter, graph_pos);
461
0
  }
462
463
0
  if (filter->data && filter->len) {
464
0
    struct bloom_filter *upgrade;
465
0
    if (!settings || settings->hash_version == filter->version)
466
0
      return filter;
467
468
    /* version mismatch, see if we can upgrade */
469
0
    if (compute_if_not_present &&
470
0
        git_env_bool("GIT_TEST_UPGRADE_BLOOM_FILTERS", 1)) {
471
0
      upgrade = upgrade_filter(r, c, filter,
472
0
             settings->hash_version);
473
0
      if (upgrade) {
474
0
        if (computed)
475
0
          *computed |= BLOOM_UPGRADED;
476
0
        return upgrade;
477
0
      }
478
0
    }
479
0
  }
480
0
  if (!compute_if_not_present)
481
0
    return NULL;
482
483
0
  repo_diff_setup(r, &diffopt);
484
0
  diffopt.flags.recursive = 1;
485
0
  diffopt.detect_rename = 0;
486
0
  diffopt.max_changes = settings->max_changed_paths;
487
0
  diff_setup_done(&diffopt);
488
489
  /* ensure commit is parsed so we have parent information */
490
0
  repo_parse_commit(r, c);
491
492
0
  if (c->parents)
493
0
    diff_tree_oid(&c->parents->item->object.oid, &c->object.oid, "", &diffopt);
494
0
  else
495
0
    diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
496
0
  diffcore_std(&diffopt);
497
498
0
  if (diff_queued_diff.nr <= settings->max_changed_paths) {
499
0
    struct hashmap pathmap = HASHMAP_INIT(pathmap_cmp, NULL);
500
0
    struct pathmap_hash_entry *e;
501
0
    struct hashmap_iter iter;
502
503
0
    for (i = 0; i < diff_queued_diff.nr; i++) {
504
0
      const char *path = diff_queued_diff.queue[i]->two->path;
505
506
      /*
507
       * Add each leading directory of the changed file, i.e. for
508
       * 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so
509
       * the Bloom filter could be used to speed up commands like
510
       * 'git log dir/subdir', too.
511
       *
512
       * Note that directories are added without the trailing '/'.
513
       */
514
0
      do {
515
0
        char *last_slash = strrchr(path, '/');
516
517
0
        FLEX_ALLOC_STR(e, path, path);
518
0
        hashmap_entry_init(&e->entry, strhash(path));
519
520
0
        if (!hashmap_get(&pathmap, &e->entry, NULL))
521
0
          hashmap_add(&pathmap, &e->entry);
522
0
        else
523
0
          free(e);
524
525
0
        if (!last_slash)
526
0
          last_slash = (char*)path;
527
0
        *last_slash = '\0';
528
529
0
      } while (*path);
530
0
    }
531
532
0
    if (hashmap_get_size(&pathmap) > settings->max_changed_paths) {
533
0
      init_truncated_large_filter(filter,
534
0
                settings->hash_version);
535
0
      if (computed)
536
0
        *computed |= BLOOM_TRUNC_LARGE;
537
0
      goto cleanup;
538
0
    }
539
540
0
    filter->len = (hashmap_get_size(&pathmap) * settings->bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
541
0
    filter->version = settings->hash_version;
542
0
    if (!filter->len) {
543
0
      if (computed)
544
0
        *computed |= BLOOM_TRUNC_EMPTY;
545
0
      filter->len = 1;
546
0
    }
547
0
    CALLOC_ARRAY(filter->data, filter->len);
548
0
    filter->to_free = filter->data;
549
550
0
    hashmap_for_each_entry(&pathmap, &iter, e, entry) {
551
0
      struct bloom_key key;
552
0
      bloom_key_fill(&key, e->path, strlen(e->path), settings);
553
0
      add_key_to_filter(&key, filter, settings);
554
0
      bloom_key_clear(&key);
555
0
    }
556
557
0
  cleanup:
558
0
    hashmap_clear_and_free(&pathmap, struct pathmap_hash_entry, entry);
559
0
  } else {
560
0
    init_truncated_large_filter(filter, settings->hash_version);
561
562
0
    if (computed)
563
0
      *computed |= BLOOM_TRUNC_LARGE;
564
0
  }
565
566
0
  if (computed)
567
0
    *computed |= BLOOM_COMPUTED;
568
569
0
  diff_queue_clear(&diff_queued_diff);
570
0
  return filter;
571
0
}
572
573
int bloom_filter_contains(const struct bloom_filter *filter,
574
        const struct bloom_key *key,
575
        const struct bloom_filter_settings *settings)
576
0
{
577
0
  int i;
578
0
  uint64_t mod = filter->len * BITS_PER_WORD;
579
580
0
  if (!mod)
581
0
    return -1;
582
583
0
  for (i = 0; i < settings->num_hashes; i++) {
584
0
    uint64_t hash_mod = key->hashes[i] % mod;
585
0
    uint64_t block_pos = hash_mod / BITS_PER_WORD;
586
0
    if (!(filter->data[block_pos] & get_bitmask(hash_mod)))
587
0
      return 0;
588
0
  }
589
590
0
  return 1;
591
0
}
592
593
int bloom_filter_contains_vec(const struct bloom_filter *filter,
594
            const struct bloom_keyvec *vec,
595
            const struct bloom_filter_settings *settings)
596
0
{
597
0
  int ret = 1;
598
599
0
  for (size_t nr = 0; ret > 0 && nr < vec->count; nr++)
600
0
    ret = bloom_filter_contains(filter, &vec->key[nr], settings);
601
602
0
  return ret;
603
0
}
604
605
uint32_t test_bloom_murmur3_seeded(uint32_t seed, const char *data, size_t len,
606
           int version)
607
0
{
608
0
  assert(version == 1 || version == 2);
609
610
0
  if (version == 2)
611
0
    return murmur3_seeded_v2(seed, data, len);
612
0
  else
613
0
    return murmur3_seeded_v1(seed, data, len);
614
0
}