/src/git/bloom.c

Source
#define DISABLE_SIGN_COMPARE_WARNINGS

#include "git-compat-util.h"
#include "bloom.h"
#include "diff.h"
#include "diffcore.h"
#include "hashmap.h"
#include "commit-graph.h"
#include "commit.h"
#include "commit-slab.h"
#include "tree.h"
#include "tree-walk.h"
#include "config.h"
#include "repository.h"

define_commit_slab(bloom_filter_slab, struct bloom_filter);

static struct bloom_filter_slab bloom_filters;

struct pathmap_hash_entry {
    struct hashmap_entry entry;
    const char path[FLEX_ARRAY];
};

static uint32_t rotate_left(uint32_t value, int32_t count)
{
  uint32_t mask = 8 * sizeof(uint32_t) - 1;
  count &= mask;
  return ((value << count) | (value >> ((-count) & mask)));
}

static inline unsigned char get_bitmask(uint32_t pos)
{
  return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1));
}

static int check_bloom_offset(struct commit_graph *g, uint32_t pos,
            uint32_t offset)
{
  /*
   * Note that we allow offsets equal to the data size, which would set
   * our pointers at one past the end of the chunk memory. This is
   * necessary because the on-disk index points to the end of the
   * entries (so we can compute size by comparing adjacent ones). And
   * naturally the final entry's end is one-past-the-end of the chunk.
   */
  if (offset <= g->chunk_bloom_data_size - BLOOMDATA_CHUNK_HEADER_SIZE)
    return 0;

  warning("ignoring out-of-range offset (%"PRIuMAX") for changed-path"
    " filter at pos %"PRIuMAX" of %s (chunk size: %"PRIuMAX")",
    (uintmax_t)offset, (uintmax_t)pos,
    g->filename, (uintmax_t)g->chunk_bloom_data_size);
  return -1;
}

int load_bloom_filter_from_graph(struct commit_graph *g,
         struct bloom_filter *filter,
         uint32_t graph_pos)
{
  uint32_t lex_pos, start_index, end_index;

  while (graph_pos < g->num_commits_in_base)
    g = g->base_graph;

  /* The commit graph commit 'c' lives in doesn't carry Bloom filters. */
  if (!g->chunk_bloom_indexes)
    return 0;

  lex_pos = graph_pos - g->num_commits_in_base;

  end_index = get_be32(g->chunk_bloom_indexes + 4 * lex_pos);

  if (lex_pos > 0)
    start_index = get_be32(g->chunk_bloom_indexes + 4 * (lex_pos - 1));
  else
    start_index = 0;

  if (check_bloom_offset(g, lex_pos, end_index) < 0 ||
      check_bloom_offset(g, lex_pos - 1, start_index) < 0)
    return 0;

  if (end_index < start_index) {
    warning("ignoring decreasing changed-path index offsets"
      " (%"PRIuMAX" > %"PRIuMAX") for positions"
      " %"PRIuMAX" and %"PRIuMAX" of %s",
      (uintmax_t)start_index, (uintmax_t)end_index,
      (uintmax_t)(lex_pos-1), (uintmax_t)lex_pos,
      g->filename);
    return 0;
  }

  filter->len = end_index - start_index;
  filter->data = (unsigned char *)(g->chunk_bloom_data +
          sizeof(unsigned char) * start_index +
          BLOOMDATA_CHUNK_HEADER_SIZE);
  filter->version = g->bloom_filter_settings->hash_version;
  filter->to_free = NULL;

  return 1;
}

/*
 * Calculate the murmur3 32-bit hash value for the given data
 * using the given seed.
 * Produces a uniformly distributed hash value.
 * Not considered to be cryptographically secure.
 * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
 */
static uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len)
{
  const uint32_t c1 = 0xcc9e2d51;
  const uint32_t c2 = 0x1b873593;
  const uint32_t r1 = 15;
  const uint32_t r2 = 13;
  const uint32_t m = 5;
  const uint32_t n = 0xe6546b64;
  int i;
  uint32_t k1 = 0;
  const char *tail;

  int len4 = len / sizeof(uint32_t);

  uint32_t k;
  for (i = 0; i < len4; i++) {
    uint32_t byte1 = (uint32_t)(unsigned char)data[4*i];
    uint32_t byte2 = ((uint32_t)(unsigned char)data[4*i + 1]) << 8;
    uint32_t byte3 = ((uint32_t)(unsigned char)data[4*i + 2]) << 16;
    uint32_t byte4 = ((uint32_t)(unsigned char)data[4*i + 3]) << 24;
    k = byte1 | byte2 | byte3 | byte4;
    k *= c1;
    k = rotate_left(k, r1);
    k *= c2;

    seed ^= k;
    seed = rotate_left(seed, r2) * m + n;
  }

  tail = (data + len4 * sizeof(uint32_t));

  switch (len & (sizeof(uint32_t) - 1)) {
  case 3:
    k1 ^= ((uint32_t)(unsigned char)tail[2]) << 16;
    /*-fallthrough*/
  case 2:
    k1 ^= ((uint32_t)(unsigned char)tail[1]) << 8;
    /*-fallthrough*/
  case 1:
    k1 ^= ((uint32_t)(unsigned char)tail[0]) << 0;
    k1 *= c1;
    k1 = rotate_left(k1, r1);
    k1 *= c2;
    seed ^= k1;
    break;
  }

  seed ^= (uint32_t)len;
  seed ^= (seed >> 16);
  seed *= 0x85ebca6b;
  seed ^= (seed >> 13);
  seed *= 0xc2b2ae35;
  seed ^= (seed >> 16);

  return seed;
}

static uint32_t murmur3_seeded_v1(uint32_t seed, const char *data, size_t len)
{
  const uint32_t c1 = 0xcc9e2d51;
  const uint32_t c2 = 0x1b873593;
  const uint32_t r1 = 15;
  const uint32_t r2 = 13;
  const uint32_t m = 5;
  const uint32_t n = 0xe6546b64;
  int i;
  uint32_t k1 = 0;
  const char *tail;

  int len4 = len / sizeof(uint32_t);

  uint32_t k;
  for (i = 0; i < len4; i++) {
    uint32_t byte1 = (uint32_t)data[4*i];
    uint32_t byte2 = ((uint32_t)data[4*i + 1]) << 8;
    uint32_t byte3 = ((uint32_t)data[4*i + 2]) << 16;
    uint32_t byte4 = ((uint32_t)data[4*i + 3]) << 24;
    k = byte1 | byte2 | byte3 | byte4;
    k *= c1;
    k = rotate_left(k, r1);
    k *= c2;

    seed ^= k;
    seed = rotate_left(seed, r2) * m + n;
  }

  tail = (data + len4 * sizeof(uint32_t));

  switch (len & (sizeof(uint32_t) - 1)) {
  case 3:
    k1 ^= ((uint32_t)tail[2]) << 16;
    /*-fallthrough*/
  case 2:
    k1 ^= ((uint32_t)tail[1]) << 8;
    /*-fallthrough*/
  case 1:
    k1 ^= ((uint32_t)tail[0]) << 0;
    k1 *= c1;
    k1 = rotate_left(k1, r1);
    k1 *= c2;
    seed ^= k1;
    break;
  }

  seed ^= (uint32_t)len;
  seed ^= (seed >> 16);
  seed *= 0x85ebca6b;
  seed ^= (seed >> 13);
  seed *= 0xc2b2ae35;
  seed ^= (seed >> 16);

  return seed;
}

void bloom_key_fill(struct bloom_key *key, const char *data, size_t len,
        const struct bloom_filter_settings *settings)
{
  int i;
  const uint32_t seed0 = 0x293ae76f;
  const uint32_t seed1 = 0x7e646e2c;
  uint32_t hash0, hash1;
  if (settings->hash_version == 2) {
    hash0 = murmur3_seeded_v2(seed0, data, len);
    hash1 = murmur3_seeded_v2(seed1, data, len);
  } else {
    hash0 = murmur3_seeded_v1(seed0, data, len);
    hash1 = murmur3_seeded_v1(seed1, data, len);
  }

  key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t));
  for (i = 0; i < settings->num_hashes; i++)
    key->hashes[i] = hash0 + i * hash1;
}

void bloom_key_clear(struct bloom_key *key)
{
  FREE_AND_NULL(key->hashes);
}

void add_key_to_filter(const struct bloom_key *key,
           struct bloom_filter *filter,
           const struct bloom_filter_settings *settings)
{
  int i;
  uint64_t mod = filter->len * BITS_PER_WORD;

  for (i = 0; i < settings->num_hashes; i++) {
    uint64_t hash_mod = key->hashes[i] % mod;
    uint64_t block_pos = hash_mod / BITS_PER_WORD;

    filter->data[block_pos] |= get_bitmask(hash_mod);
  }
}

void init_bloom_filters(void)
{
  init_bloom_filter_slab(&bloom_filters);
}

static void free_one_bloom_filter(struct bloom_filter *filter)
{
  if (!filter)
    return;
  free(filter->to_free);
}

void deinit_bloom_filters(void)
{
  deep_clear_bloom_filter_slab(&bloom_filters, free_one_bloom_filter);
}

struct bloom_keyvec *bloom_keyvec_new(const char *path, size_t len,
              const struct bloom_filter_settings *settings)
{
  struct bloom_keyvec *vec;
  const char *p;
  size_t sz;
  size_t nr = 1;

  p = path;
  while (*p) {
    /*
     * At this point, the path is normalized to use Unix-style
     * path separators. This is required due to how the
     * changed-path Bloom filters store the paths.
     */
    if (*p == '/')
      nr++;
    p++;
  }

  sz = sizeof(struct bloom_keyvec);
  sz += nr * sizeof(struct bloom_key);
  vec = (struct bloom_keyvec *)xcalloc(1, sz);
  if (!vec)
    return NULL;
  vec->count = nr;

  bloom_key_fill(&vec->key[0], path, len, settings);
  nr = 1;
  p = path + len - 1;
  while (p > path) {
    if (*p == '/') {
      bloom_key_fill(&vec->key[nr++], path, p - path, settings);
    }
    p--;
  }
  assert(nr == vec->count);
  return vec;
}

void bloom_keyvec_free(struct bloom_keyvec *vec)
{
  if (!vec)
    return;
  for (size_t nr = 0; nr < vec->count; nr++)
    bloom_key_clear(&vec->key[nr]);
  free(vec);
}

static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED,
           const struct hashmap_entry *eptr,
           const struct hashmap_entry *entry_or_key,
           const void *keydata UNUSED)
{
  const struct pathmap_hash_entry *e1, *e2;

  e1 = container_of(eptr, const struct pathmap_hash_entry, entry);
  e2 = container_of(entry_or_key, const struct pathmap_hash_entry, entry);

  return strcmp(e1->path, e2->path);
}

static void init_truncated_large_filter(struct bloom_filter *filter,
          int version)
{
  filter->data = filter->to_free = xmalloc(1);
  filter->data[0] = 0xFF;
  filter->len = 1;
  filter->version = version;
}

#define VISITED   (1u<<21)
#define HIGH_BITS (1u<<22)

static int has_entries_with_high_bit(struct repository *r, struct tree *t)
{
  if (parse_tree(t))
    return 1;

  if (!(t->object.flags & VISITED)) {
    struct tree_desc desc;
    struct name_entry entry;

    init_tree_desc(&desc, &t->object.oid, t->buffer, t->size);
    while (tree_entry(&desc, &entry)) {
      size_t i;
      for (i = 0; i < entry.pathlen; i++) {
        if (entry.path[i] & 0x80) {
          t->object.flags |= HIGH_BITS;
          goto done;
        }
      }

      if (S_ISDIR(entry.mode)) {
        struct tree *sub = lookup_tree(r, &entry.oid);
        if (sub && has_entries_with_high_bit(r, sub)) {
          t->object.flags |= HIGH_BITS;
          goto done;
        }
      }

    }

done:
    t->object.flags |= VISITED;
  }

  return !!(t->object.flags & HIGH_BITS);
}

static int commit_tree_has_high_bit_paths(struct repository *r,
            struct commit *c)
{
  struct tree *t;
  if (repo_parse_commit(r, c))
    return 1;
  t = repo_get_commit_tree(r, c);
  if (!t)
    return 1;
  return has_entries_with_high_bit(r, t);
}

static struct bloom_filter *upgrade_filter(struct repository *r, struct commit *c,
             struct bloom_filter *filter,
             int hash_version)
{
  struct commit_list *p = c->parents;
  if (commit_tree_has_high_bit_paths(r, c))
    return NULL;

  if (p && commit_tree_has_high_bit_paths(r, p->item))
    return NULL;

  filter->version = hash_version;

  return filter;
}

struct bloom_filter *get_bloom_filter(struct repository *r, struct commit *c)
{
  struct bloom_filter *filter;
  int hash_version;

  filter = get_or_compute_bloom_filter(r, c, 0, NULL, NULL);
  if (!filter)
    return NULL;

  prepare_repo_settings(r);
  hash_version = r->settings.commit_graph_changed_paths_version;

  if (!(hash_version == -1 || hash_version == filter->version))
    return NULL; /* unusable filter */
  return filter;
}

struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
             struct commit *c,
             int compute_if_not_present,
             const struct bloom_filter_settings *settings,
             enum bloom_filter_computed *computed)
{
  struct bloom_filter *filter;
  int i;
  struct diff_options diffopt;

  if (computed)
    *computed = BLOOM_NOT_COMPUTED;

  if (!bloom_filters.slab_size)
    return NULL;

  filter = bloom_filter_slab_at(&bloom_filters, c);

  if (!filter->data) {
    struct commit_graph *g;
    uint32_t graph_pos;

    g = repo_find_commit_pos_in_graph(r, c, &graph_pos);
    if (g)
      load_bloom_filter_from_graph(g, filter, graph_pos);
  }

  if (filter->data && filter->len) {
    struct bloom_filter *upgrade;
    if (!settings || settings->hash_version == filter->version)
      return filter;

    /* version mismatch, see if we can upgrade */
    if (compute_if_not_present &&
        git_env_bool("GIT_TEST_UPGRADE_BLOOM_FILTERS", 1)) {
      upgrade = upgrade_filter(r, c, filter,
             settings->hash_version);
      if (upgrade) {
        if (computed)
          *computed |= BLOOM_UPGRADED;
        return upgrade;
      }
    }
  }
  if (!compute_if_not_present)
    return NULL;

  repo_diff_setup(r, &diffopt);
  diffopt.flags.recursive = 1;
  diffopt.detect_rename = 0;
  diffopt.max_changes = settings->max_changed_paths;
  diff_setup_done(&diffopt);

  /* ensure commit is parsed so we have parent information */
  repo_parse_commit(r, c);

  if (c->parents)
    diff_tree_oid(&c->parents->item->object.oid, &c->object.oid, "", &diffopt);
  else
    diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
  diffcore_std(&diffopt);

  if (diff_queued_diff.nr <= settings->max_changed_paths) {
    struct hashmap pathmap = HASHMAP_INIT(pathmap_cmp, NULL);
    struct pathmap_hash_entry *e;
    struct hashmap_iter iter;

    for (i = 0; i < diff_queued_diff.nr; i++) {
      const char *path = diff_queued_diff.queue[i]->two->path;

      /*
       * Add each leading directory of the changed file, i.e. for
       * 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so
       * the Bloom filter could be used to speed up commands like
       * 'git log dir/subdir', too.
       *
       * Note that directories are added without the trailing '/'.
       */
      do {
        char *last_slash = strrchr(path, '/');

        FLEX_ALLOC_STR(e, path, path);
        hashmap_entry_init(&e->entry, strhash(path));

        if (!hashmap_get(&pathmap, &e->entry, NULL))
          hashmap_add(&pathmap, &e->entry);
        else
          free(e);

        if (!last_slash)
          last_slash = (char*)path;
        *last_slash = '\0';

      } while (*path);
    }

    if (hashmap_get_size(&pathmap) > settings->max_changed_paths) {
      init_truncated_large_filter(filter,
                settings->hash_version);
      if (computed)
        *computed |= BLOOM_TRUNC_LARGE;
      goto cleanup;
    }

    filter->len = (hashmap_get_size(&pathmap) * settings->bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
    filter->version = settings->hash_version;
    if (!filter->len) {
      if (computed)
        *computed |= BLOOM_TRUNC_EMPTY;
      filter->len = 1;
    }
    CALLOC_ARRAY(filter->data, filter->len);
    filter->to_free = filter->data;

    hashmap_for_each_entry(&pathmap, &iter, e, entry) {
      struct bloom_key key;
      bloom_key_fill(&key, e->path, strlen(e->path), settings);
      add_key_to_filter(&key, filter, settings);
      bloom_key_clear(&key);
    }

  cleanup:
    hashmap_clear_and_free(&pathmap, struct pathmap_hash_entry, entry);
  } else {
    init_truncated_large_filter(filter, settings->hash_version);

    if (computed)
      *computed |= BLOOM_TRUNC_LARGE;
  }

  if (computed)
    *computed |= BLOOM_COMPUTED;

  diff_queue_clear(&diff_queued_diff);
  return filter;
}

int bloom_filter_contains(const struct bloom_filter *filter,
        const struct bloom_key *key,
        const struct bloom_filter_settings *settings)
{
  int i;
  uint64_t mod = filter->len * BITS_PER_WORD;

  if (!mod)
    return -1;

  for (i = 0; i < settings->num_hashes; i++) {
    uint64_t hash_mod = key->hashes[i] % mod;
    uint64_t block_pos = hash_mod / BITS_PER_WORD;
    if (!(filter->data[block_pos] & get_bitmask(hash_mod)))
      return 0;
  }

  return 1;
}

int bloom_filter_contains_vec(const struct bloom_filter *filter,
            const struct bloom_keyvec *vec,
            const struct bloom_filter_settings *settings)
{
  int ret = 1;

  for (size_t nr = 0; ret > 0 && nr < vec->count; nr++)
    ret = bloom_filter_contains(filter, &vec->key[nr], settings);

  return ret;
}

uint32_t test_bloom_murmur3_seeded(uint32_t seed, const char *data, size_t len,
           int version)
{
  assert(version == 1 || version == 2);

  if (version == 2)
    return murmur3_seeded_v2(seed, data, len);
  else
    return murmur3_seeded_v1(seed, data, len);
}

Coverage Report

Created: 2026-01-09 07:10

Line	Count	Source
1		#define DISABLE_SIGN_COMPARE_WARNINGS
2
3		#include "git-compat-util.h"
4		#include "bloom.h"
5		#include "diff.h"
6		#include "diffcore.h"
7		#include "hashmap.h"
8		#include "commit-graph.h"
9		#include "commit.h"
10		#include "commit-slab.h"
11		#include "tree.h"
12		#include "tree-walk.h"
13		#include "config.h"
14		#include "repository.h"
15
16	0	define_commit_slab(bloom_filter_slab, struct bloom_filter); Unexecuted instantiation: bloom.c:init_bloom_filter_slab_with_stride Unexecuted instantiation: bloom.c:clear_bloom_filter_slab Unexecuted instantiation: bloom.c:bloom_filter_slab_at_peek
17	0
18	0	static struct bloom_filter_slab bloom_filters;
19	0
20	0	struct pathmap_hash_entry {
21	0	struct hashmap_entry entry;
22	0	const char path[FLEX_ARRAY];
23	0	};
24	0
25	0	static uint32_t rotate_left(uint32_t value, int32_t count)
26	0	{
27	0	uint32_t mask = 8 * sizeof(uint32_t) - 1;
28	0	count &= mask;
29	0	return ((value << count) \| (value >> ((-count) & mask)));
30	0	}
31
32		static inline unsigned char get_bitmask(uint32_t pos)
33	0	{
34	0	return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1));
35	0	}
36
37		static int check_bloom_offset(struct commit_graph *g, uint32_t pos,
38		uint32_t offset)
39	0	{
40		/*
41		* Note that we allow offsets equal to the data size, which would set
42		* our pointers at one past the end of the chunk memory. This is
43		* necessary because the on-disk index points to the end of the
44		* entries (so we can compute size by comparing adjacent ones). And
45		* naturally the final entry's end is one-past-the-end of the chunk.
46		*/
47	0	if (offset <= g->chunk_bloom_data_size - BLOOMDATA_CHUNK_HEADER_SIZE)
48	0	return 0;
49
50	0	warning("ignoring out-of-range offset (%"PRIuMAX") for changed-path"
51	0	" filter at pos %"PRIuMAX" of %s (chunk size: %"PRIuMAX")",
52	0	(uintmax_t)offset, (uintmax_t)pos,
53	0	g->filename, (uintmax_t)g->chunk_bloom_data_size);
54	0	return -1;
55	0	}
56
57		int load_bloom_filter_from_graph(struct commit_graph *g,
58		struct bloom_filter *filter,
59		uint32_t graph_pos)
60	0	{
61	0	uint32_t lex_pos, start_index, end_index;
62
63	0	while (graph_pos < g->num_commits_in_base)
64	0	g = g->base_graph;
65
66		/* The commit graph commit 'c' lives in doesn't carry Bloom filters. */
67	0	if (!g->chunk_bloom_indexes)
68	0	return 0;
69
70	0	lex_pos = graph_pos - g->num_commits_in_base;
71
72	0	end_index = get_be32(g->chunk_bloom_indexes + 4 * lex_pos);
73
74	0	if (lex_pos > 0)
75	0	start_index = get_be32(g->chunk_bloom_indexes + 4 * (lex_pos - 1));
76	0	else
77	0	start_index = 0;
78
79	0	if (check_bloom_offset(g, lex_pos, end_index) < 0 \|\|
80	0	check_bloom_offset(g, lex_pos - 1, start_index) < 0)
81	0	return 0;
82
83	0	if (end_index < start_index) {
84	0	warning("ignoring decreasing changed-path index offsets"
85	0	" (%"PRIuMAX" > %"PRIuMAX") for positions"
86	0	" %"PRIuMAX" and %"PRIuMAX" of %s",
87	0	(uintmax_t)start_index, (uintmax_t)end_index,
88	0	(uintmax_t)(lex_pos-1), (uintmax_t)lex_pos,
89	0	g->filename);
90	0	return 0;
91	0	}
92
93	0	filter->len = end_index - start_index;
94	0	filter->data = (unsigned char *)(g->chunk_bloom_data +
95	0	sizeof(unsigned char) * start_index +
96	0	BLOOMDATA_CHUNK_HEADER_SIZE);
97	0	filter->version = g->bloom_filter_settings->hash_version;
98	0	filter->to_free = NULL;
99
100	0	return 1;
101	0	}
102
103		/*
104		* Calculate the murmur3 32-bit hash value for the given data
105		* using the given seed.
106		* Produces a uniformly distributed hash value.
107		* Not considered to be cryptographically secure.
108		* Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
109		*/
110		static uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len)
111	0	{
112	0	const uint32_t c1 = 0xcc9e2d51;
113	0	const uint32_t c2 = 0x1b873593;
114	0	const uint32_t r1 = 15;
115	0	const uint32_t r2 = 13;
116	0	const uint32_t m = 5;
117	0	const uint32_t n = 0xe6546b64;
118	0	int i;
119	0	uint32_t k1 = 0;
120	0	const char *tail;
121
122	0	int len4 = len / sizeof(uint32_t);
123
124	0	uint32_t k;
125	0	for (i = 0; i < len4; i++) {
126	0	uint32_t byte1 = (uint32_t)(unsigned char)data[4*i];
127	0	uint32_t byte2 = ((uint32_t)(unsigned char)data[4*i + 1]) << 8;
128	0	uint32_t byte3 = ((uint32_t)(unsigned char)data[4*i + 2]) << 16;
129	0	uint32_t byte4 = ((uint32_t)(unsigned char)data[4*i + 3]) << 24;
130	0	k = byte1 \| byte2 \| byte3 \| byte4;
131	0	k *= c1;
132	0	k = rotate_left(k, r1);
133	0	k *= c2;
134
135	0	seed ^= k;
136	0	seed = rotate_left(seed, r2) * m + n;
137	0	}
138
139	0	tail = (data + len4 * sizeof(uint32_t));
140
141	0	switch (len & (sizeof(uint32_t) - 1)) {
142	0	case 3:
143	0	k1 ^= ((uint32_t)(unsigned char)tail[2]) << 16;
144		/-fallthrough/
145	0	case 2:
146	0	k1 ^= ((uint32_t)(unsigned char)tail[1]) << 8;
147		/-fallthrough/
148	0	case 1:
149	0	k1 ^= ((uint32_t)(unsigned char)tail[0]) << 0;
150	0	k1 *= c1;
151	0	k1 = rotate_left(k1, r1);
152	0	k1 *= c2;
153	0	seed ^= k1;
154	0	break;
155	0	}
156
157	0	seed ^= (uint32_t)len;
158	0	seed ^= (seed >> 16);
159	0	seed *= 0x85ebca6b;
160	0	seed ^= (seed >> 13);
161	0	seed *= 0xc2b2ae35;
162	0	seed ^= (seed >> 16);
163
164	0	return seed;
165	0	}
166
167		static uint32_t murmur3_seeded_v1(uint32_t seed, const char *data, size_t len)
168	0	{
169	0	const uint32_t c1 = 0xcc9e2d51;
170	0	const uint32_t c2 = 0x1b873593;
171	0	const uint32_t r1 = 15;
172	0	const uint32_t r2 = 13;
173	0	const uint32_t m = 5;
174	0	const uint32_t n = 0xe6546b64;
175	0	int i;
176	0	uint32_t k1 = 0;
177	0	const char *tail;
178
179	0	int len4 = len / sizeof(uint32_t);
180
181	0	uint32_t k;
182	0	for (i = 0; i < len4; i++) {
183	0	uint32_t byte1 = (uint32_t)data[4*i];
184	0	uint32_t byte2 = ((uint32_t)data[4*i + 1]) << 8;
185	0	uint32_t byte3 = ((uint32_t)data[4*i + 2]) << 16;
186	0	uint32_t byte4 = ((uint32_t)data[4*i + 3]) << 24;
187	0	k = byte1 \| byte2 \| byte3 \| byte4;
188	0	k *= c1;
189	0	k = rotate_left(k, r1);
190	0	k *= c2;
191
192	0	seed ^= k;
193	0	seed = rotate_left(seed, r2) * m + n;
194	0	}
195
196	0	tail = (data + len4 * sizeof(uint32_t));
197
198	0	switch (len & (sizeof(uint32_t) - 1)) {
199	0	case 3:
200	0	k1 ^= ((uint32_t)tail[2]) << 16;
201		/-fallthrough/
202	0	case 2:
203	0	k1 ^= ((uint32_t)tail[1]) << 8;
204		/-fallthrough/
205	0	case 1:
206	0	k1 ^= ((uint32_t)tail[0]) << 0;
207	0	k1 *= c1;
208	0	k1 = rotate_left(k1, r1);
209	0	k1 *= c2;
210	0	seed ^= k1;
211	0	break;
212	0	}
213
214	0	seed ^= (uint32_t)len;
215	0	seed ^= (seed >> 16);
216	0	seed *= 0x85ebca6b;
217	0	seed ^= (seed >> 13);
218	0	seed *= 0xc2b2ae35;
219	0	seed ^= (seed >> 16);
220
221	0	return seed;
222	0	}
223
224		void bloom_key_fill(struct bloom_key key, const char data, size_t len,
225		const struct bloom_filter_settings *settings)
226	0	{
227	0	int i;
228	0	const uint32_t seed0 = 0x293ae76f;
229	0	const uint32_t seed1 = 0x7e646e2c;
230	0	uint32_t hash0, hash1;
231	0	if (settings->hash_version == 2) {
232	0	hash0 = murmur3_seeded_v2(seed0, data, len);
233	0	hash1 = murmur3_seeded_v2(seed1, data, len);
234	0	} else {
235	0	hash0 = murmur3_seeded_v1(seed0, data, len);
236	0	hash1 = murmur3_seeded_v1(seed1, data, len);
237	0	}
238
239	0	key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t));
240	0	for (i = 0; i < settings->num_hashes; i++)
241	0	key->hashes[i] = hash0 + i * hash1;
242	0	}
243
244		void bloom_key_clear(struct bloom_key *key)
245	0	{
246	0	FREE_AND_NULL(key->hashes);
247	0	}
248
249		void add_key_to_filter(const struct bloom_key *key,
250		struct bloom_filter *filter,
251		const struct bloom_filter_settings *settings)
252	0	{
253	0	int i;
254	0	uint64_t mod = filter->len * BITS_PER_WORD;
255
256	0	for (i = 0; i < settings->num_hashes; i++) {
257	0	uint64_t hash_mod = key->hashes[i] % mod;
258	0	uint64_t block_pos = hash_mod / BITS_PER_WORD;
259
260	0	filter->data[block_pos] \|= get_bitmask(hash_mod);
261	0	}
262	0	}
263
264		void init_bloom_filters(void)
265	0	{
266	0	init_bloom_filter_slab(&bloom_filters);
267	0	}
268
269		static void free_one_bloom_filter(struct bloom_filter *filter)
270	0	{
271	0	if (!filter)
272	0	return;
273	0	free(filter->to_free);
274	0	}
275
276		void deinit_bloom_filters(void)
277	0	{
278	0	deep_clear_bloom_filter_slab(&bloom_filters, free_one_bloom_filter);
279	0	}
280
281		struct bloom_keyvec bloom_keyvec_new(const char path, size_t len,
282		const struct bloom_filter_settings *settings)
283	0	{
284	0	struct bloom_keyvec *vec;
285	0	const char *p;
286	0	size_t sz;
287	0	size_t nr = 1;
288
289	0	p = path;
290	0	while (*p) {
291		/*
292		* At this point, the path is normalized to use Unix-style
293		* path separators. This is required due to how the
294		* changed-path Bloom filters store the paths.
295		*/
296	0	if (*p == '/')
297	0	nr++;
298	0	p++;
299	0	}
300
301	0	sz = sizeof(struct bloom_keyvec);
302	0	sz += nr * sizeof(struct bloom_key);
303	0	vec = (struct bloom_keyvec *)xcalloc(1, sz);
304	0	if (!vec)
305	0	return NULL;
306	0	vec->count = nr;
307
308	0	bloom_key_fill(&vec->key[0], path, len, settings);
309	0	nr = 1;
310	0	p = path + len - 1;
311	0	while (p > path) {
312	0	if (*p == '/') {
313	0	bloom_key_fill(&vec->key[nr++], path, p - path, settings);
314	0	}
315	0	p--;
316	0	}
317	0	assert(nr == vec->count);
318	0	return vec;
319	0	}
320
321		void bloom_keyvec_free(struct bloom_keyvec *vec)
322	0	{
323	0	if (!vec)
324	0	return;
325	0	for (size_t nr = 0; nr < vec->count; nr++)
326	0	bloom_key_clear(&vec->key[nr]);
327	0	free(vec);
328	0	}
329
330		static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED,
331		const struct hashmap_entry *eptr,
332		const struct hashmap_entry *entry_or_key,
333		const void *keydata UNUSED)
334	0	{
335	0	const struct pathmap_hash_entry e1, e2;
336
337	0	e1 = container_of(eptr, const struct pathmap_hash_entry, entry);
338	0	e2 = container_of(entry_or_key, const struct pathmap_hash_entry, entry);
339
340	0	return strcmp(e1->path, e2->path);
341	0	}
342
343		static void init_truncated_large_filter(struct bloom_filter *filter,
344		int version)
345	0	{
346	0	filter->data = filter->to_free = xmalloc(1);
347	0	filter->data[0] = 0xFF;
348	0	filter->len = 1;
349	0	filter->version = version;
350	0	}
351
352	0	#define VISITED (1u<<21)
353	0	#define HIGH_BITS (1u<<22)
354
355		static int has_entries_with_high_bit(struct repository r, struct tree t)
356	0	{
357	0	if (parse_tree(t))
358	0	return 1;
359
360	0	if (!(t->object.flags & VISITED)) {
361	0	struct tree_desc desc;
362	0	struct name_entry entry;
363
364	0	init_tree_desc(&desc, &t->object.oid, t->buffer, t->size);
365	0	while (tree_entry(&desc, &entry)) {
366	0	size_t i;
367	0	for (i = 0; i < entry.pathlen; i++) {
368	0	if (entry.path[i] & 0x80) {
369	0	t->object.flags \|= HIGH_BITS;
370	0	goto done;
371	0	}
372	0	}
373
374	0	if (S_ISDIR(entry.mode)) {
375	0	struct tree *sub = lookup_tree(r, &entry.oid);
376	0	if (sub && has_entries_with_high_bit(r, sub)) {
377	0	t->object.flags \|= HIGH_BITS;
378	0	goto done;
379	0	}
380	0	}
381
382	0	}
383
384	0	done:
385	0	t->object.flags \|= VISITED;
386	0	}
387
388	0	return !!(t->object.flags & HIGH_BITS);
389	0	}
390
391		static int commit_tree_has_high_bit_paths(struct repository *r,
392		struct commit *c)
393	0	{
394	0	struct tree *t;
395	0	if (repo_parse_commit(r, c))
396	0	return 1;
397	0	t = repo_get_commit_tree(r, c);
398	0	if (!t)
399	0	return 1;
400	0	return has_entries_with_high_bit(r, t);
401	0	}
402
403		static struct bloom_filter upgrade_filter(struct repository r, struct commit *c,
404		struct bloom_filter *filter,
405		int hash_version)
406	0	{
407	0	struct commit_list *p = c->parents;
408	0	if (commit_tree_has_high_bit_paths(r, c))
409	0	return NULL;
410
411	0	if (p && commit_tree_has_high_bit_paths(r, p->item))
412	0	return NULL;
413
414	0	filter->version = hash_version;
415
416	0	return filter;
417	0	}
418
419		struct bloom_filter get_bloom_filter(struct repository r, struct commit *c)
420	0	{
421	0	struct bloom_filter *filter;
422	0	int hash_version;
423
424	0	filter = get_or_compute_bloom_filter(r, c, 0, NULL, NULL);
425	0	if (!filter)
426	0	return NULL;
427
428	0	prepare_repo_settings(r);
429	0	hash_version = r->settings.commit_graph_changed_paths_version;
430
431	0	if (!(hash_version == -1 \|\| hash_version == filter->version))
432	0	return NULL; /* unusable filter */
433	0	return filter;
434	0	}
435
436		struct bloom_filter get_or_compute_bloom_filter(struct repository r,
437		struct commit *c,
438		int compute_if_not_present,
439		const struct bloom_filter_settings *settings,
440		enum bloom_filter_computed *computed)
441	0	{
442	0	struct bloom_filter *filter;
443	0	int i;
444	0	struct diff_options diffopt;
445
446	0	if (computed)
447	0	*computed = BLOOM_NOT_COMPUTED;
448
449	0	if (!bloom_filters.slab_size)
450	0	return NULL;
451
452	0	filter = bloom_filter_slab_at(&bloom_filters, c);
453
454	0	if (!filter->data) {
455	0	struct commit_graph *g;
456	0	uint32_t graph_pos;
457
458	0	g = repo_find_commit_pos_in_graph(r, c, &graph_pos);
459	0	if (g)
460	0	load_bloom_filter_from_graph(g, filter, graph_pos);
461	0	}
462
463	0	if (filter->data && filter->len) {
464	0	struct bloom_filter *upgrade;
465	0	if (!settings \|\| settings->hash_version == filter->version)
466	0	return filter;
467
468		/* version mismatch, see if we can upgrade */
469	0	if (compute_if_not_present &&
470	0	git_env_bool("GIT_TEST_UPGRADE_BLOOM_FILTERS", 1)) {
471	0	upgrade = upgrade_filter(r, c, filter,
472	0	settings->hash_version);
473	0	if (upgrade) {
474	0	if (computed)
475	0	*computed \|= BLOOM_UPGRADED;
476	0	return upgrade;
477	0	}
478	0	}
479	0	}
480	0	if (!compute_if_not_present)
481	0	return NULL;
482
483	0	repo_diff_setup(r, &diffopt);
484	0	diffopt.flags.recursive = 1;
485	0	diffopt.detect_rename = 0;
486	0	diffopt.max_changes = settings->max_changed_paths;
487	0	diff_setup_done(&diffopt);
488
489		/* ensure commit is parsed so we have parent information */
490	0	repo_parse_commit(r, c);
491
492	0	if (c->parents)
493	0	diff_tree_oid(&c->parents->item->object.oid, &c->object.oid, "", &diffopt);
494	0	else
495	0	diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
496	0	diffcore_std(&diffopt);
497
498	0	if (diff_queued_diff.nr <= settings->max_changed_paths) {
499	0	struct hashmap pathmap = HASHMAP_INIT(pathmap_cmp, NULL);
500	0	struct pathmap_hash_entry *e;
501	0	struct hashmap_iter iter;
502
503	0	for (i = 0; i < diff_queued_diff.nr; i++) {
504	0	const char *path = diff_queued_diff.queue[i]->two->path;
505
506		/*
507		* Add each leading directory of the changed file, i.e. for
508		* 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so
509		* the Bloom filter could be used to speed up commands like
510		* 'git log dir/subdir', too.
511		*
512		* Note that directories are added without the trailing '/'.
513		*/
514	0	do {
515	0	char *last_slash = strrchr(path, '/');
516
517	0	FLEX_ALLOC_STR(e, path, path);
518	0	hashmap_entry_init(&e->entry, strhash(path));
519
520	0	if (!hashmap_get(&pathmap, &e->entry, NULL))
521	0	hashmap_add(&pathmap, &e->entry);
522	0	else
523	0	free(e);
524
525	0	if (!last_slash)
526	0	last_slash = (char*)path;
527	0	*last_slash = '\0';
528
529	0	} while (*path);
530	0	}
531
532	0	if (hashmap_get_size(&pathmap) > settings->max_changed_paths) {
533	0	init_truncated_large_filter(filter,
534	0	settings->hash_version);
535	0	if (computed)
536	0	*computed \|= BLOOM_TRUNC_LARGE;
537	0	goto cleanup;
538	0	}
539
540	0	filter->len = (hashmap_get_size(&pathmap) * settings->bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
541	0	filter->version = settings->hash_version;
542	0	if (!filter->len) {
543	0	if (computed)
544	0	*computed \|= BLOOM_TRUNC_EMPTY;
545	0	filter->len = 1;
546	0	}
547	0	CALLOC_ARRAY(filter->data, filter->len);
548	0	filter->to_free = filter->data;
549
550	0	hashmap_for_each_entry(&pathmap, &iter, e, entry) {
551	0	struct bloom_key key;
552	0	bloom_key_fill(&key, e->path, strlen(e->path), settings);
553	0	add_key_to_filter(&key, filter, settings);
554	0	bloom_key_clear(&key);
555	0	}
556
557	0	cleanup:
558	0	hashmap_clear_and_free(&pathmap, struct pathmap_hash_entry, entry);
559	0	} else {
560	0	init_truncated_large_filter(filter, settings->hash_version);
561
562	0	if (computed)
563	0	*computed \|= BLOOM_TRUNC_LARGE;
564	0	}
565
566	0	if (computed)
567	0	*computed \|= BLOOM_COMPUTED;
568
569	0	diff_queue_clear(&diff_queued_diff);
570	0	return filter;
571	0	}
572
573		int bloom_filter_contains(const struct bloom_filter *filter,
574		const struct bloom_key *key,
575		const struct bloom_filter_settings *settings)
576	0	{
577	0	int i;
578	0	uint64_t mod = filter->len * BITS_PER_WORD;
579
580	0	if (!mod)
581	0	return -1;
582
583	0	for (i = 0; i < settings->num_hashes; i++) {
584	0	uint64_t hash_mod = key->hashes[i] % mod;
585	0	uint64_t block_pos = hash_mod / BITS_PER_WORD;
586	0	if (!(filter->data[block_pos] & get_bitmask(hash_mod)))
587	0	return 0;
588	0	}
589
590	0	return 1;
591	0	}
592
593		int bloom_filter_contains_vec(const struct bloom_filter *filter,
594		const struct bloom_keyvec *vec,
595		const struct bloom_filter_settings *settings)
596	0	{
597	0	int ret = 1;
598
599	0	for (size_t nr = 0; ret > 0 && nr < vec->count; nr++)
600	0	ret = bloom_filter_contains(filter, &vec->key[nr], settings);
601
602	0	return ret;
603	0	}
604
605		uint32_t test_bloom_murmur3_seeded(uint32_t seed, const char *data, size_t len,
606		int version)
607	0	{
608	0	assert(version == 1 \|\| version == 2);
609
610	0	if (version == 2)
611	0	return murmur3_seeded_v2(seed, data, len);
612	0	else
613	0	return murmur3_seeded_v1(seed, data, len);
614	0	}