/src/git/bloom.c

Source (jump to first uncovered line)
#include "git-compat-util.h"
#include "bloom.h"
#include "diff.h"
#include "diffcore.h"
#include "hashmap.h"
#include "commit-graph.h"
#include "commit.h"
#include "commit-slab.h"
#include "tree.h"
#include "tree-walk.h"
#include "config.h"
#include "repository.h"

define_commit_slab(bloom_filter_slab, struct bloom_filter);

static struct bloom_filter_slab bloom_filters;

struct pathmap_hash_entry {
    struct hashmap_entry entry;
    const char path[FLEX_ARRAY];
};

static uint32_t rotate_left(uint32_t value, int32_t count)
{
  uint32_t mask = 8 * sizeof(uint32_t) - 1;
  count &= mask;
  return ((value << count) | (value >> ((-count) & mask)));
}

static inline unsigned char get_bitmask(uint32_t pos)
{
  return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1));
}

static int check_bloom_offset(struct commit_graph *g, uint32_t pos,
            uint32_t offset)
{
  /*
   * Note that we allow offsets equal to the data size, which would set
   * our pointers at one past the end of the chunk memory. This is
   * necessary because the on-disk index points to the end of the
   * entries (so we can compute size by comparing adjacent ones). And
   * naturally the final entry's end is one-past-the-end of the chunk.
   */
  if (offset <= g->chunk_bloom_data_size - BLOOMDATA_CHUNK_HEADER_SIZE)
    return 0;

  warning("ignoring out-of-range offset (%"PRIuMAX") for changed-path"
    " filter at pos %"PRIuMAX" of %s (chunk size: %"PRIuMAX")",
    (uintmax_t)offset, (uintmax_t)pos,
    g->filename, (uintmax_t)g->chunk_bloom_data_size);
  return -1;
}

int load_bloom_filter_from_graph(struct commit_graph *g,
         struct bloom_filter *filter,
         uint32_t graph_pos)
{
  uint32_t lex_pos, start_index, end_index;

  while (graph_pos < g->num_commits_in_base)
    g = g->base_graph;

  /* The commit graph commit 'c' lives in doesn't carry Bloom filters. */
  if (!g->chunk_bloom_indexes)
    return 0;

  lex_pos = graph_pos - g->num_commits_in_base;

  end_index = get_be32(g->chunk_bloom_indexes + 4 * lex_pos);

  if (lex_pos > 0)
    start_index = get_be32(g->chunk_bloom_indexes + 4 * (lex_pos - 1));
  else
    start_index = 0;

  if (check_bloom_offset(g, lex_pos, end_index) < 0 ||
      check_bloom_offset(g, lex_pos - 1, start_index) < 0)
    return 0;

  if (end_index < start_index) {
    warning("ignoring decreasing changed-path index offsets"
      " (%"PRIuMAX" > %"PRIuMAX") for positions"
      " %"PRIuMAX" and %"PRIuMAX" of %s",
      (uintmax_t)start_index, (uintmax_t)end_index,
      (uintmax_t)(lex_pos-1), (uintmax_t)lex_pos,
      g->filename);
    return 0;
  }

  filter->len = end_index - start_index;
  filter->data = (unsigned char *)(g->chunk_bloom_data +
          sizeof(unsigned char) * start_index +
          BLOOMDATA_CHUNK_HEADER_SIZE);
  filter->version = g->bloom_filter_settings->hash_version;
  filter->to_free = NULL;

  return 1;
}

/*
 * Calculate the murmur3 32-bit hash value for the given data
 * using the given seed.
 * Produces a uniformly distributed hash value.
 * Not considered to be cryptographically secure.
 * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
 */
uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len)
{
  const uint32_t c1 = 0xcc9e2d51;
  const uint32_t c2 = 0x1b873593;
  const uint32_t r1 = 15;
  const uint32_t r2 = 13;
  const uint32_t m = 5;
  const uint32_t n = 0xe6546b64;
  int i;
  uint32_t k1 = 0;
  const char *tail;

  int len4 = len / sizeof(uint32_t);

  uint32_t k;
  for (i = 0; i < len4; i++) {
    uint32_t byte1 = (uint32_t)(unsigned char)data[4*i];
    uint32_t byte2 = ((uint32_t)(unsigned char)data[4*i + 1]) << 8;
    uint32_t byte3 = ((uint32_t)(unsigned char)data[4*i + 2]) << 16;
    uint32_t byte4 = ((uint32_t)(unsigned char)data[4*i + 3]) << 24;
    k = byte1 | byte2 | byte3 | byte4;
    k *= c1;
    k = rotate_left(k, r1);
    k *= c2;

    seed ^= k;
    seed = rotate_left(seed, r2) * m + n;
  }

  tail = (data + len4 * sizeof(uint32_t));

  switch (len & (sizeof(uint32_t) - 1)) {
  case 3:
    k1 ^= ((uint32_t)(unsigned char)tail[2]) << 16;
    /*-fallthrough*/
  case 2:
    k1 ^= ((uint32_t)(unsigned char)tail[1]) << 8;
    /*-fallthrough*/
  case 1:
    k1 ^= ((uint32_t)(unsigned char)tail[0]) << 0;
    k1 *= c1;
    k1 = rotate_left(k1, r1);
    k1 *= c2;
    seed ^= k1;
    break;
  }

  seed ^= (uint32_t)len;
  seed ^= (seed >> 16);
  seed *= 0x85ebca6b;
  seed ^= (seed >> 13);
  seed *= 0xc2b2ae35;
  seed ^= (seed >> 16);

  return seed;
}

static uint32_t murmur3_seeded_v1(uint32_t seed, const char *data, size_t len)
{
  const uint32_t c1 = 0xcc9e2d51;
  const uint32_t c2 = 0x1b873593;
  const uint32_t r1 = 15;
  const uint32_t r2 = 13;
  const uint32_t m = 5;
  const uint32_t n = 0xe6546b64;
  int i;
  uint32_t k1 = 0;
  const char *tail;

  int len4 = len / sizeof(uint32_t);

  uint32_t k;
  for (i = 0; i < len4; i++) {
    uint32_t byte1 = (uint32_t)data[4*i];
    uint32_t byte2 = ((uint32_t)data[4*i + 1]) << 8;
    uint32_t byte3 = ((uint32_t)data[4*i + 2]) << 16;
    uint32_t byte4 = ((uint32_t)data[4*i + 3]) << 24;
    k = byte1 | byte2 | byte3 | byte4;
    k *= c1;
    k = rotate_left(k, r1);
    k *= c2;

    seed ^= k;
    seed = rotate_left(seed, r2) * m + n;
  }

  tail = (data + len4 * sizeof(uint32_t));

  switch (len & (sizeof(uint32_t) - 1)) {
  case 3:
    k1 ^= ((uint32_t)tail[2]) << 16;
    /*-fallthrough*/
  case 2:
    k1 ^= ((uint32_t)tail[1]) << 8;
    /*-fallthrough*/
  case 1:
    k1 ^= ((uint32_t)tail[0]) << 0;
    k1 *= c1;
    k1 = rotate_left(k1, r1);
    k1 *= c2;
    seed ^= k1;
    break;
  }

  seed ^= (uint32_t)len;
  seed ^= (seed >> 16);
  seed *= 0x85ebca6b;
  seed ^= (seed >> 13);
  seed *= 0xc2b2ae35;
  seed ^= (seed >> 16);

  return seed;
}

void fill_bloom_key(const char *data,
        size_t len,
        struct bloom_key *key,
        const struct bloom_filter_settings *settings)
{
  int i;
  const uint32_t seed0 = 0x293ae76f;
  const uint32_t seed1 = 0x7e646e2c;
  uint32_t hash0, hash1;
  if (settings->hash_version == 2) {
    hash0 = murmur3_seeded_v2(seed0, data, len);
    hash1 = murmur3_seeded_v2(seed1, data, len);
  } else {
    hash0 = murmur3_seeded_v1(seed0, data, len);
    hash1 = murmur3_seeded_v1(seed1, data, len);
  }

  key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t));
  for (i = 0; i < settings->num_hashes; i++)
    key->hashes[i] = hash0 + i * hash1;
}

void clear_bloom_key(struct bloom_key *key)
{
  FREE_AND_NULL(key->hashes);
}

void add_key_to_filter(const struct bloom_key *key,
           struct bloom_filter *filter,
           const struct bloom_filter_settings *settings)
{
  int i;
  uint64_t mod = filter->len * BITS_PER_WORD;

  for (i = 0; i < settings->num_hashes; i++) {
    uint64_t hash_mod = key->hashes[i] % mod;
    uint64_t block_pos = hash_mod / BITS_PER_WORD;

    filter->data[block_pos] |= get_bitmask(hash_mod);
  }
}

void init_bloom_filters(void)
{
  init_bloom_filter_slab(&bloom_filters);
}

static void free_one_bloom_filter(struct bloom_filter *filter)
{
  if (!filter)
    return;
  free(filter->to_free);
}

void deinit_bloom_filters(void)
{
  deep_clear_bloom_filter_slab(&bloom_filters, free_one_bloom_filter);
}

static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED,
           const struct hashmap_entry *eptr,
           const struct hashmap_entry *entry_or_key,
           const void *keydata UNUSED)
{
  const struct pathmap_hash_entry *e1, *e2;

  e1 = container_of(eptr, const struct pathmap_hash_entry, entry);
  e2 = container_of(entry_or_key, const struct pathmap_hash_entry, entry);

  return strcmp(e1->path, e2->path);
}

static void init_truncated_large_filter(struct bloom_filter *filter,
          int version)
{
  filter->data = filter->to_free = xmalloc(1);
  filter->data[0] = 0xFF;
  filter->len = 1;
  filter->version = version;
}

#define VISITED   (1u<<21)
#define HIGH_BITS (1u<<22)

static int has_entries_with_high_bit(struct repository *r, struct tree *t)
{
  if (parse_tree(t))
    return 1;

  if (!(t->object.flags & VISITED)) {
    struct tree_desc desc;
    struct name_entry entry;

    init_tree_desc(&desc, &t->object.oid, t->buffer, t->size);
    while (tree_entry(&desc, &entry)) {
      size_t i;
      for (i = 0; i < entry.pathlen; i++) {
        if (entry.path[i] & 0x80) {
          t->object.flags |= HIGH_BITS;
          goto done;
        }
      }

      if (S_ISDIR(entry.mode)) {
        struct tree *sub = lookup_tree(r, &entry.oid);
        if (sub && has_entries_with_high_bit(r, sub)) {
          t->object.flags |= HIGH_BITS;
          goto done;
        }
      }

    }

done:
    t->object.flags |= VISITED;
  }

  return !!(t->object.flags & HIGH_BITS);
}

static int commit_tree_has_high_bit_paths(struct repository *r,
            struct commit *c)
{
  struct tree *t;
  if (repo_parse_commit(r, c))
    return 1;
  t = repo_get_commit_tree(r, c);
  if (!t)
    return 1;
  return has_entries_with_high_bit(r, t);
}

static struct bloom_filter *upgrade_filter(struct repository *r, struct commit *c,
             struct bloom_filter *filter,
             int hash_version)
{
  struct commit_list *p = c->parents;
  if (commit_tree_has_high_bit_paths(r, c))
    return NULL;

  if (p && commit_tree_has_high_bit_paths(r, p->item))
    return NULL;

  filter->version = hash_version;

  return filter;
}

struct bloom_filter *get_bloom_filter(struct repository *r, struct commit *c)
{
  struct bloom_filter *filter;
  int hash_version;

  filter = get_or_compute_bloom_filter(r, c, 0, NULL, NULL);
  if (!filter)
    return NULL;

  prepare_repo_settings(r);
  hash_version = r->settings.commit_graph_changed_paths_version;

  if (!(hash_version == -1 || hash_version == filter->version))
    return NULL; /* unusable filter */
  return filter;
}

struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
             struct commit *c,
             int compute_if_not_present,
             const struct bloom_filter_settings *settings,
             enum bloom_filter_computed *computed)
{
  struct bloom_filter *filter;
  int i;
  struct diff_options diffopt;

  if (computed)
    *computed = BLOOM_NOT_COMPUTED;

  if (!bloom_filters.slab_size)
    return NULL;

  filter = bloom_filter_slab_at(&bloom_filters, c);

  if (!filter->data) {
    uint32_t graph_pos;
    if (repo_find_commit_pos_in_graph(r, c, &graph_pos))
      load_bloom_filter_from_graph(r->objects->commit_graph,
                 filter, graph_pos);
  }

  if (filter->data && filter->len) {
    struct bloom_filter *upgrade;
    if (!settings || settings->hash_version == filter->version)
      return filter;

    /* version mismatch, see if we can upgrade */
    if (compute_if_not_present &&
        git_env_bool("GIT_TEST_UPGRADE_BLOOM_FILTERS", 1)) {
      upgrade = upgrade_filter(r, c, filter,
             settings->hash_version);
      if (upgrade) {
        if (computed)
          *computed |= BLOOM_UPGRADED;
        return upgrade;
      }
    }
  }
  if (!compute_if_not_present)
    return NULL;

  repo_diff_setup(r, &diffopt);
  diffopt.flags.recursive = 1;
  diffopt.detect_rename = 0;
  diffopt.max_changes = settings->max_changed_paths;
  diff_setup_done(&diffopt);

  /* ensure commit is parsed so we have parent information */
  repo_parse_commit(r, c);

  if (c->parents)
    diff_tree_oid(&c->parents->item->object.oid, &c->object.oid, "", &diffopt);
  else
    diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
  diffcore_std(&diffopt);

  if (diff_queued_diff.nr <= settings->max_changed_paths) {
    struct hashmap pathmap = HASHMAP_INIT(pathmap_cmp, NULL);
    struct pathmap_hash_entry *e;
    struct hashmap_iter iter;

    for (i = 0; i < diff_queued_diff.nr; i++) {
      const char *path = diff_queued_diff.queue[i]->two->path;

      /*
       * Add each leading directory of the changed file, i.e. for
       * 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so
       * the Bloom filter could be used to speed up commands like
       * 'git log dir/subdir', too.
       *
       * Note that directories are added without the trailing '/'.
       */
      do {
        char *last_slash = strrchr(path, '/');

        FLEX_ALLOC_STR(e, path, path);
        hashmap_entry_init(&e->entry, strhash(path));

        if (!hashmap_get(&pathmap, &e->entry, NULL))
          hashmap_add(&pathmap, &e->entry);
        else
          free(e);

        if (!last_slash)
          last_slash = (char*)path;
        *last_slash = '\0';

      } while (*path);

      diff_free_filepair(diff_queued_diff.queue[i]);
    }

    if (hashmap_get_size(&pathmap) > settings->max_changed_paths) {
      init_truncated_large_filter(filter,
                settings->hash_version);
      if (computed)
        *computed |= BLOOM_TRUNC_LARGE;
      goto cleanup;
    }

    filter->len = (hashmap_get_size(&pathmap) * settings->bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
    filter->version = settings->hash_version;
    if (!filter->len) {
      if (computed)
        *computed |= BLOOM_TRUNC_EMPTY;
      filter->len = 1;
    }
    CALLOC_ARRAY(filter->data, filter->len);
    filter->to_free = filter->data;

    hashmap_for_each_entry(&pathmap, &iter, e, entry) {
      struct bloom_key key;
      fill_bloom_key(e->path, strlen(e->path), &key, settings);
      add_key_to_filter(&key, filter, settings);
      clear_bloom_key(&key);
    }

  cleanup:
    hashmap_clear_and_free(&pathmap, struct pathmap_hash_entry, entry);
  } else {
    for (i = 0; i < diff_queued_diff.nr; i++)
      diff_free_filepair(diff_queued_diff.queue[i]);
    init_truncated_large_filter(filter, settings->hash_version);

    if (computed)
      *computed |= BLOOM_TRUNC_LARGE;
  }

  if (computed)
    *computed |= BLOOM_COMPUTED;

  free(diff_queued_diff.queue);
  DIFF_QUEUE_CLEAR(&diff_queued_diff);

  return filter;
}

int bloom_filter_contains(const struct bloom_filter *filter,
        const struct bloom_key *key,
        const struct bloom_filter_settings *settings)
{
  int i;
  uint64_t mod = filter->len * BITS_PER_WORD;

  if (!mod)
    return -1;

  for (i = 0; i < settings->num_hashes; i++) {
    uint64_t hash_mod = key->hashes[i] % mod;
    uint64_t block_pos = hash_mod / BITS_PER_WORD;
    if (!(filter->data[block_pos] & get_bitmask(hash_mod)))
      return 0;
  }

  return 1;
}

Coverage Report

Created: 2024-09-08 06:23

Line	Count	Source (jump to first uncovered line)
1		#include "git-compat-util.h"
2		#include "bloom.h"
3		#include "diff.h"
4		#include "diffcore.h"
5		#include "hashmap.h"
6		#include "commit-graph.h"
7		#include "commit.h"
8		#include "commit-slab.h"
9		#include "tree.h"
10		#include "tree-walk.h"
11		#include "config.h"
12		#include "repository.h"
13
14		define_commit_slab(bloom_filter_slab, struct bloom_filter);
15
16		static struct bloom_filter_slab bloom_filters;
17
18		struct pathmap_hash_entry {
19		struct hashmap_entry entry;
20		const char path[FLEX_ARRAY];
21		};
22
23		static uint32_t rotate_left(uint32_t value, int32_t count)
24	0	{
25	0	uint32_t mask = 8 * sizeof(uint32_t) - 1;
26	0	count &= mask;
27	0	return ((value << count) \| (value >> ((-count) & mask)));
28	0	}
29
30		static inline unsigned char get_bitmask(uint32_t pos)
31	0	{
32	0	return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1));
33	0	}
34
35		static int check_bloom_offset(struct commit_graph *g, uint32_t pos,
36		uint32_t offset)
37	0	{
38		/*
39		* Note that we allow offsets equal to the data size, which would set
40		* our pointers at one past the end of the chunk memory. This is
41		* necessary because the on-disk index points to the end of the
42		* entries (so we can compute size by comparing adjacent ones). And
43		* naturally the final entry's end is one-past-the-end of the chunk.
44		*/
45	0	if (offset <= g->chunk_bloom_data_size - BLOOMDATA_CHUNK_HEADER_SIZE)
46	0	return 0;
47
48	0	warning("ignoring out-of-range offset (%"PRIuMAX") for changed-path"
49	0	" filter at pos %"PRIuMAX" of %s (chunk size: %"PRIuMAX")",
50	0	(uintmax_t)offset, (uintmax_t)pos,
51	0	g->filename, (uintmax_t)g->chunk_bloom_data_size);
52	0	return -1;
53	0	}
54
55		int load_bloom_filter_from_graph(struct commit_graph *g,
56		struct bloom_filter *filter,
57		uint32_t graph_pos)
58	0	{
59	0	uint32_t lex_pos, start_index, end_index;
60
61	0	while (graph_pos < g->num_commits_in_base)
62	0	g = g->base_graph;
63
64		/* The commit graph commit 'c' lives in doesn't carry Bloom filters. */
65	0	if (!g->chunk_bloom_indexes)
66	0	return 0;
67
68	0	lex_pos = graph_pos - g->num_commits_in_base;
69
70	0	end_index = get_be32(g->chunk_bloom_indexes + 4 * lex_pos);
71
72	0	if (lex_pos > 0)
73	0	start_index = get_be32(g->chunk_bloom_indexes + 4 * (lex_pos - 1));
74	0	else
75	0	start_index = 0;
76
77	0	if (check_bloom_offset(g, lex_pos, end_index) < 0 \|\|
78	0	check_bloom_offset(g, lex_pos - 1, start_index) < 0)
79	0	return 0;
80
81	0	if (end_index < start_index) {
82	0	warning("ignoring decreasing changed-path index offsets"
83	0	" (%"PRIuMAX" > %"PRIuMAX") for positions"
84	0	" %"PRIuMAX" and %"PRIuMAX" of %s",
85	0	(uintmax_t)start_index, (uintmax_t)end_index,
86	0	(uintmax_t)(lex_pos-1), (uintmax_t)lex_pos,
87	0	g->filename);
88	0	return 0;
89	0	}
90
91	0	filter->len = end_index - start_index;
92	0	filter->data = (unsigned char *)(g->chunk_bloom_data +
93	0	sizeof(unsigned char) * start_index +
94	0	BLOOMDATA_CHUNK_HEADER_SIZE);
95	0	filter->version = g->bloom_filter_settings->hash_version;
96	0	filter->to_free = NULL;
97
98	0	return 1;
99	0	}
100
101		/*
102		* Calculate the murmur3 32-bit hash value for the given data
103		* using the given seed.
104		* Produces a uniformly distributed hash value.
105		* Not considered to be cryptographically secure.
106		* Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
107		*/
108		uint32_t murmur3_seeded_v2(uint32_t seed, const char *data, size_t len)
109	0	{
110	0	const uint32_t c1 = 0xcc9e2d51;
111	0	const uint32_t c2 = 0x1b873593;
112	0	const uint32_t r1 = 15;
113	0	const uint32_t r2 = 13;
114	0	const uint32_t m = 5;
115	0	const uint32_t n = 0xe6546b64;
116	0	int i;
117	0	uint32_t k1 = 0;
118	0	const char *tail;
119
120	0	int len4 = len / sizeof(uint32_t);
121
122	0	uint32_t k;
123	0	for (i = 0; i < len4; i++) {
124	0	uint32_t byte1 = (uint32_t)(unsigned char)data[4*i];
125	0	uint32_t byte2 = ((uint32_t)(unsigned char)data[4*i + 1]) << 8;
126	0	uint32_t byte3 = ((uint32_t)(unsigned char)data[4*i + 2]) << 16;
127	0	uint32_t byte4 = ((uint32_t)(unsigned char)data[4*i + 3]) << 24;
128	0	k = byte1 \| byte2 \| byte3 \| byte4;
129	0	k *= c1;
130	0	k = rotate_left(k, r1);
131	0	k *= c2;
132
133	0	seed ^= k;
134	0	seed = rotate_left(seed, r2) * m + n;
135	0	}
136
137	0	tail = (data + len4 * sizeof(uint32_t));
138
139	0	switch (len & (sizeof(uint32_t) - 1)) {
140	0	case 3:
141	0	k1 ^= ((uint32_t)(unsigned char)tail[2]) << 16;
142		/-fallthrough/
143	0	case 2:
144	0	k1 ^= ((uint32_t)(unsigned char)tail[1]) << 8;
145		/-fallthrough/
146	0	case 1:
147	0	k1 ^= ((uint32_t)(unsigned char)tail[0]) << 0;
148	0	k1 *= c1;
149	0	k1 = rotate_left(k1, r1);
150	0	k1 *= c2;
151	0	seed ^= k1;
152	0	break;
153	0	}
154
155	0	seed ^= (uint32_t)len;
156	0	seed ^= (seed >> 16);
157	0	seed *= 0x85ebca6b;
158	0	seed ^= (seed >> 13);
159	0	seed *= 0xc2b2ae35;
160	0	seed ^= (seed >> 16);
161
162	0	return seed;
163	0	}
164
165		static uint32_t murmur3_seeded_v1(uint32_t seed, const char *data, size_t len)
166	0	{
167	0	const uint32_t c1 = 0xcc9e2d51;
168	0	const uint32_t c2 = 0x1b873593;
169	0	const uint32_t r1 = 15;
170	0	const uint32_t r2 = 13;
171	0	const uint32_t m = 5;
172	0	const uint32_t n = 0xe6546b64;
173	0	int i;
174	0	uint32_t k1 = 0;
175	0	const char *tail;
176
177	0	int len4 = len / sizeof(uint32_t);
178
179	0	uint32_t k;
180	0	for (i = 0; i < len4; i++) {
181	0	uint32_t byte1 = (uint32_t)data[4*i];
182	0	uint32_t byte2 = ((uint32_t)data[4*i + 1]) << 8;
183	0	uint32_t byte3 = ((uint32_t)data[4*i + 2]) << 16;
184	0	uint32_t byte4 = ((uint32_t)data[4*i + 3]) << 24;
185	0	k = byte1 \| byte2 \| byte3 \| byte4;
186	0	k *= c1;
187	0	k = rotate_left(k, r1);
188	0	k *= c2;
189
190	0	seed ^= k;
191	0	seed = rotate_left(seed, r2) * m + n;
192	0	}
193
194	0	tail = (data + len4 * sizeof(uint32_t));
195
196	0	switch (len & (sizeof(uint32_t) - 1)) {
197	0	case 3:
198	0	k1 ^= ((uint32_t)tail[2]) << 16;
199		/-fallthrough/
200	0	case 2:
201	0	k1 ^= ((uint32_t)tail[1]) << 8;
202		/-fallthrough/
203	0	case 1:
204	0	k1 ^= ((uint32_t)tail[0]) << 0;
205	0	k1 *= c1;
206	0	k1 = rotate_left(k1, r1);
207	0	k1 *= c2;
208	0	seed ^= k1;
209	0	break;
210	0	}
211
212	0	seed ^= (uint32_t)len;
213	0	seed ^= (seed >> 16);
214	0	seed *= 0x85ebca6b;
215	0	seed ^= (seed >> 13);
216	0	seed *= 0xc2b2ae35;
217	0	seed ^= (seed >> 16);
218
219	0	return seed;
220	0	}
221
222		void fill_bloom_key(const char *data,
223		size_t len,
224		struct bloom_key *key,
225		const struct bloom_filter_settings *settings)
226	0	{
227	0	int i;
228	0	const uint32_t seed0 = 0x293ae76f;
229	0	const uint32_t seed1 = 0x7e646e2c;
230	0	uint32_t hash0, hash1;
231	0	if (settings->hash_version == 2) {
232	0	hash0 = murmur3_seeded_v2(seed0, data, len);
233	0	hash1 = murmur3_seeded_v2(seed1, data, len);
234	0	} else {
235	0	hash0 = murmur3_seeded_v1(seed0, data, len);
236	0	hash1 = murmur3_seeded_v1(seed1, data, len);
237	0	}
238
239	0	key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t));
240	0	for (i = 0; i < settings->num_hashes; i++)
241	0	key->hashes[i] = hash0 + i * hash1;
242	0	}
243
244		void clear_bloom_key(struct bloom_key *key)
245	0	{
246	0	FREE_AND_NULL(key->hashes);
247	0	}
248
249		void add_key_to_filter(const struct bloom_key *key,
250		struct bloom_filter *filter,
251		const struct bloom_filter_settings *settings)
252	0	{
253	0	int i;
254	0	uint64_t mod = filter->len * BITS_PER_WORD;
255
256	0	for (i = 0; i < settings->num_hashes; i++) {
257	0	uint64_t hash_mod = key->hashes[i] % mod;
258	0	uint64_t block_pos = hash_mod / BITS_PER_WORD;
259
260	0	filter->data[block_pos] \|= get_bitmask(hash_mod);
261	0	}
262	0	}
263
264		void init_bloom_filters(void)
265	0	{
266	0	init_bloom_filter_slab(&bloom_filters);
267	0	}
268
269		static void free_one_bloom_filter(struct bloom_filter *filter)
270	0	{
271	0	if (!filter)
272	0	return;
273	0	free(filter->to_free);
274	0	}
275
276		void deinit_bloom_filters(void)
277	0	{
278	0	deep_clear_bloom_filter_slab(&bloom_filters, free_one_bloom_filter);
279	0	}
280
281		static int pathmap_cmp(const void *hashmap_cmp_fn_data UNUSED,
282		const struct hashmap_entry *eptr,
283		const struct hashmap_entry *entry_or_key,
284		const void *keydata UNUSED)
285	0	{
286	0	const struct pathmap_hash_entry e1, e2;
287
288	0	e1 = container_of(eptr, const struct pathmap_hash_entry, entry);
289	0	e2 = container_of(entry_or_key, const struct pathmap_hash_entry, entry);
290
291	0	return strcmp(e1->path, e2->path);
292	0	}
293
294		static void init_truncated_large_filter(struct bloom_filter *filter,
295		int version)
296	0	{
297	0	filter->data = filter->to_free = xmalloc(1);
298	0	filter->data[0] = 0xFF;
299	0	filter->len = 1;
300	0	filter->version = version;
301	0	}
302
303	0	#define VISITED (1u<<21)
304	0	#define HIGH_BITS (1u<<22)
305
306		static int has_entries_with_high_bit(struct repository r, struct tree t)
307	0	{
308	0	if (parse_tree(t))
309	0	return 1;
310
311	0	if (!(t->object.flags & VISITED)) {
312	0	struct tree_desc desc;
313	0	struct name_entry entry;
314
315	0	init_tree_desc(&desc, &t->object.oid, t->buffer, t->size);
316	0	while (tree_entry(&desc, &entry)) {
317	0	size_t i;
318	0	for (i = 0; i < entry.pathlen; i++) {
319	0	if (entry.path[i] & 0x80) {
320	0	t->object.flags \|= HIGH_BITS;
321	0	goto done;
322	0	}
323	0	}
324
325	0	if (S_ISDIR(entry.mode)) {
326	0	struct tree *sub = lookup_tree(r, &entry.oid);
327	0	if (sub && has_entries_with_high_bit(r, sub)) {
328	0	t->object.flags \|= HIGH_BITS;
329	0	goto done;
330	0	}
331	0	}
332
333	0	}
334
335	0	done:
336	0	t->object.flags \|= VISITED;
337	0	}
338
339	0	return !!(t->object.flags & HIGH_BITS);
340	0	}
341
342		static int commit_tree_has_high_bit_paths(struct repository *r,
343		struct commit *c)
344	0	{
345	0	struct tree *t;
346	0	if (repo_parse_commit(r, c))
347	0	return 1;
348	0	t = repo_get_commit_tree(r, c);
349	0	if (!t)
350	0	return 1;
351	0	return has_entries_with_high_bit(r, t);
352	0	}
353
354		static struct bloom_filter upgrade_filter(struct repository r, struct commit *c,
355		struct bloom_filter *filter,
356		int hash_version)
357	0	{
358	0	struct commit_list *p = c->parents;
359	0	if (commit_tree_has_high_bit_paths(r, c))
360	0	return NULL;
361
362	0	if (p && commit_tree_has_high_bit_paths(r, p->item))
363	0	return NULL;
364
365	0	filter->version = hash_version;
366
367	0	return filter;
368	0	}
369
370		struct bloom_filter get_bloom_filter(struct repository r, struct commit *c)
371	0	{
372	0	struct bloom_filter *filter;
373	0	int hash_version;
374
375	0	filter = get_or_compute_bloom_filter(r, c, 0, NULL, NULL);
376	0	if (!filter)
377	0	return NULL;
378
379	0	prepare_repo_settings(r);
380	0	hash_version = r->settings.commit_graph_changed_paths_version;
381
382	0	if (!(hash_version == -1 \|\| hash_version == filter->version))
383	0	return NULL; /* unusable filter */
384	0	return filter;
385	0	}
386
387		struct bloom_filter get_or_compute_bloom_filter(struct repository r,
388		struct commit *c,
389		int compute_if_not_present,
390		const struct bloom_filter_settings *settings,
391		enum bloom_filter_computed *computed)
392	0	{
393	0	struct bloom_filter *filter;
394	0	int i;
395	0	struct diff_options diffopt;
396
397	0	if (computed)
398	0	*computed = BLOOM_NOT_COMPUTED;
399
400	0	if (!bloom_filters.slab_size)
401	0	return NULL;
402
403	0	filter = bloom_filter_slab_at(&bloom_filters, c);
404
405	0	if (!filter->data) {
406	0	uint32_t graph_pos;
407	0	if (repo_find_commit_pos_in_graph(r, c, &graph_pos))
408	0	load_bloom_filter_from_graph(r->objects->commit_graph,
409	0	filter, graph_pos);
410	0	}
411
412	0	if (filter->data && filter->len) {
413	0	struct bloom_filter *upgrade;
414	0	if (!settings \|\| settings->hash_version == filter->version)
415	0	return filter;
416
417		/* version mismatch, see if we can upgrade */
418	0	if (compute_if_not_present &&
419	0	git_env_bool("GIT_TEST_UPGRADE_BLOOM_FILTERS", 1)) {
420	0	upgrade = upgrade_filter(r, c, filter,
421	0	settings->hash_version);
422	0	if (upgrade) {
423	0	if (computed)
424	0	*computed \|= BLOOM_UPGRADED;
425	0	return upgrade;
426	0	}
427	0	}
428	0	}
429	0	if (!compute_if_not_present)
430	0	return NULL;
431
432	0	repo_diff_setup(r, &diffopt);
433	0	diffopt.flags.recursive = 1;
434	0	diffopt.detect_rename = 0;
435	0	diffopt.max_changes = settings->max_changed_paths;
436	0	diff_setup_done(&diffopt);
437
438		/* ensure commit is parsed so we have parent information */
439	0	repo_parse_commit(r, c);
440
441	0	if (c->parents)
442	0	diff_tree_oid(&c->parents->item->object.oid, &c->object.oid, "", &diffopt);
443	0	else
444	0	diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
445	0	diffcore_std(&diffopt);
446
447	0	if (diff_queued_diff.nr <= settings->max_changed_paths) {
448	0	struct hashmap pathmap = HASHMAP_INIT(pathmap_cmp, NULL);
449	0	struct pathmap_hash_entry *e;
450	0	struct hashmap_iter iter;
451
452	0	for (i = 0; i < diff_queued_diff.nr; i++) {
453	0	const char *path = diff_queued_diff.queue[i]->two->path;
454
455		/*
456		* Add each leading directory of the changed file, i.e. for
457		* 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so
458		* the Bloom filter could be used to speed up commands like
459		* 'git log dir/subdir', too.
460		*
461		* Note that directories are added without the trailing '/'.
462		*/
463	0	do {
464	0	char *last_slash = strrchr(path, '/');
465
466	0	FLEX_ALLOC_STR(e, path, path);
467	0	hashmap_entry_init(&e->entry, strhash(path));
468
469	0	if (!hashmap_get(&pathmap, &e->entry, NULL))
470	0	hashmap_add(&pathmap, &e->entry);
471	0	else
472	0	free(e);
473
474	0	if (!last_slash)
475	0	last_slash = (char*)path;
476	0	*last_slash = '\0';
477
478	0	} while (*path);
479
480	0	diff_free_filepair(diff_queued_diff.queue[i]);
481	0	}
482
483	0	if (hashmap_get_size(&pathmap) > settings->max_changed_paths) {
484	0	init_truncated_large_filter(filter,
485	0	settings->hash_version);
486	0	if (computed)
487	0	*computed \|= BLOOM_TRUNC_LARGE;
488	0	goto cleanup;
489	0	}
490
491	0	filter->len = (hashmap_get_size(&pathmap) * settings->bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
492	0	filter->version = settings->hash_version;
493	0	if (!filter->len) {
494	0	if (computed)
495	0	*computed \|= BLOOM_TRUNC_EMPTY;
496	0	filter->len = 1;
497	0	}
498	0	CALLOC_ARRAY(filter->data, filter->len);
499	0	filter->to_free = filter->data;
500
501	0	hashmap_for_each_entry(&pathmap, &iter, e, entry) {
502	0	struct bloom_key key;
503	0	fill_bloom_key(e->path, strlen(e->path), &key, settings);
504	0	add_key_to_filter(&key, filter, settings);
505	0	clear_bloom_key(&key);
506	0	}
507
508	0	cleanup:
509	0	hashmap_clear_and_free(&pathmap, struct pathmap_hash_entry, entry);
510	0	} else {
511	0	for (i = 0; i < diff_queued_diff.nr; i++)
512	0	diff_free_filepair(diff_queued_diff.queue[i]);
513	0	init_truncated_large_filter(filter, settings->hash_version);
514
515	0	if (computed)
516	0	*computed \|= BLOOM_TRUNC_LARGE;
517	0	}
518
519	0	if (computed)
520	0	*computed \|= BLOOM_COMPUTED;
521
522	0	free(diff_queued_diff.queue);
523	0	DIFF_QUEUE_CLEAR(&diff_queued_diff);
524
525	0	return filter;
526	0	}
527
528		int bloom_filter_contains(const struct bloom_filter *filter,
529		const struct bloom_key *key,
530		const struct bloom_filter_settings *settings)
531	0	{
532	0	int i;
533	0	uint64_t mod = filter->len * BITS_PER_WORD;
534
535	0	if (!mod)
536	0	return -1;
537
538	0	for (i = 0; i < settings->num_hashes; i++) {
539	0	uint64_t hash_mod = key->hashes[i] % mod;
540	0	uint64_t block_pos = hash_mod / BITS_PER_WORD;
541	0	if (!(filter->data[block_pos] & get_bitmask(hash_mod)))
542	0	return 0;
543	0	}
544
545	0	return 1;
546	0	}