/src/rocksdb/util/hash.cc
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "util/hash.h"

#include <cassert>
#include <string>

#include "port/lang.h"
#include "util/coding.h"
#include "util/hash128.h"
#include "util/math128.h"
#include "util/xxhash.h"
#include "util/xxph3.h"

namespace ROCKSDB_NAMESPACE {

uint64_t (*kGetSliceNPHash64UnseededFnPtr)(const Slice&) = &GetSliceHash64;

uint32_t Hash(const char* data, size_t n, uint32_t seed) {
  // MurmurHash1 - fast but mediocre quality
  // https://github.com/aappleby/smhasher/wiki/MurmurHash1
  //
  const uint32_t m = 0xc6a4a793;
  const uint32_t r = 24;
  const char* limit = data + n;
  uint32_t h = static_cast<uint32_t>(seed ^ (n * m));

  // Pick up four bytes at a time
  while (data + 4 <= limit) {
    uint32_t w = DecodeFixed32(data);
    data += 4;
    h += w;
    h *= m;
    h ^= (h >> 16);
  }

  // Pick up remaining bytes
  switch (limit - data) {
    // Note: The original hash implementation used data[i] << shift, which
    // promotes the char to int and then performs the shift. If the char is
    // negative, the shift is undefined behavior in C++. The hash algorithm is
    // part of the format definition, so we cannot change it; to obtain the
    // same behavior in a legal way we just cast to uint32_t, which will do
    // sign-extension. To guarantee compatibility with architectures where
    // chars are unsigned we first cast the char to int8_t.
    case 3:
      h += static_cast<uint32_t>(static_cast<int8_t>(data[2])) << 16;
      FALLTHROUGH_INTENDED;
    case 2:
      h += static_cast<uint32_t>(static_cast<int8_t>(data[1])) << 8;
      FALLTHROUGH_INTENDED;
    case 1:
      h += static_cast<uint32_t>(static_cast<int8_t>(data[0]));
      h *= m;
      h ^= (h >> r);
      break;
  }
  return h;
}
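
// Illustrative usage sketch (not part of the original file): Hash() is the
// legacy 32-bit entry point, typically called with an arbitrary fixed seed.
//
//   std::string key = "abc";
//   uint32_t h32 = Hash(key.data(), key.size(), /*seed=*/0xbc9f1d34);
//
// The seed shown here is only an example value.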

// We are standardizing on a preview release of XXH3, because that's
// the best available at time of standardizing.
//
// In testing (mostly Intel Skylake), this hash function is much more
// thorough than Hash32 and is almost universally faster. Hash() only
// seems faster when passing runtime-sized keys of the same small size
// (less than about 24 bytes) thousands of times in a row; this seems
// to allow the branch predictor to work some magic. XXH3's speed is
// much less dependent on branch prediction.
//
// Hashing with a prefix extractor is potentially a common case of
// hashing objects of small, predictable size. We could consider
// bundling hash functions specialized for particular lengths with
// the prefix extractors.
uint64_t Hash64(const char* data, size_t n, uint64_t seed) {
  return XXPH3_64bits_withSeed(data, n, seed);
}

uint64_t Hash64(const char* data, size_t n) {
  // Same as seed = 0
  return XXPH3_64bits(data, n);
}
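
// Illustrative note (not part of the original file): per the "Same as
// seed = 0" comment above, the unseeded overload agrees with the seeded one
// at seed == 0.
//
//   assert(Hash64("abc", 3) == Hash64("abc", 3, /*seed=*/0));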

uint64_t GetSlicePartsNPHash64(const SliceParts& data, uint64_t seed) {
  // TODO(ajkr): use XXH3 streaming APIs to avoid the copy/allocation.
  size_t concat_len = 0;
  for (int i = 0; i < data.num_parts; ++i) {
    concat_len += data.parts[i].size();
  }
  std::string concat_data;
  concat_data.reserve(concat_len);
  for (int i = 0; i < data.num_parts; ++i) {
    concat_data.append(data.parts[i].data(), data.parts[i].size());
  }
  assert(concat_data.size() == concat_len);
  return NPHash64(concat_data.data(), concat_len, seed);
}
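
// Illustrative sketch (not part of the original file): since the parts are
// concatenated before hashing, the result matches NPHash64 over the
// flattened key bytes.
//
//   Slice parts[2] = {Slice("foo"), Slice("bar")};
//   SliceParts key(parts, /*num_parts=*/2);
//   // GetSlicePartsNPHash64(key, seed) == NPHash64("foobar", 6, seed)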

Unsigned128 Hash128(const char* data, size_t n, uint64_t seed) {
  auto h = XXH3_128bits_withSeed(data, n, seed);
  return (Unsigned128{h.high64} << 64) | (h.low64);
}

Unsigned128 Hash128(const char* data, size_t n) {
  // Same as seed = 0
  auto h = XXH3_128bits(data, n);
  return (Unsigned128{h.high64} << 64) | (h.low64);
}

void Hash2x64(const char* data, size_t n, uint64_t* high64, uint64_t* low64) {
  // Same as seed = 0
  auto h = XXH3_128bits(data, n);
  *high64 = h.high64;
  *low64 = h.low64;
}

void Hash2x64(const char* data, size_t n, uint64_t seed, uint64_t* high64,
              uint64_t* low64) {
  auto h = XXH3_128bits_withSeed(data, n, seed);
  *high64 = h.high64;
  *low64 = h.low64;
}
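
// Illustrative sketch (not part of the original file): Hash128 packs the
// same two 64-bit halves that Hash2x64 writes out, so for any input:
//
//   uint64_t hi, lo;
//   Hash2x64("abc", 3, &hi, &lo);
//   // Hash128("abc", 3) == (Unsigned128{hi} << 64) | lo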

namespace {

inline uint64_t XXH3_avalanche(uint64_t h64) {
  h64 ^= h64 >> 37;
  h64 *= 0x165667919E3779F9U;
  h64 ^= h64 >> 32;
  return h64;
}

inline uint64_t XXH3_unavalanche(uint64_t h64) {
  h64 ^= h64 >> 32;
  h64 *= 0x8da8ee41d6df849U;  // inverse of 0x165667919E3779F9U
  h64 ^= h64 >> 37;
  return h64;
}

}  // namespace
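
// Illustrative note (not part of the original file): XXH3_unavalanche undoes
// XXH3_avalanche by applying the inverse steps in reverse order. x ^= x >> k
// is its own inverse when k >= 32, and the multiplication by an odd constant
// is undone by its multiplicative inverse mod 2^64, so:
//
//   uint64_t x = 0x0123456789abcdefU;
//   assert(XXH3_unavalanche(XXH3_avalanche(x)) == x);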

void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed,
                       uint64_t* out_high64, uint64_t* out_low64) {
  // Adapted from XXH3_len_9to16_128b
  const uint64_t bitflipl = /*secret part*/ 0x59973f0033362349U - seed;
  const uint64_t bitfliph = /*secret part*/ 0xc202797692d63d58U + seed;
  Unsigned128 tmp128 =
      Multiply64to128(in_low64 ^ in_high64 ^ bitflipl, 0x9E3779B185EBCA87U);
  uint64_t lo = Lower64of128(tmp128);
  uint64_t hi = Upper64of128(tmp128);
  lo += 0x3c0000000000000U;  // (len - 1) << 54
  in_high64 ^= bitfliph;
  hi += in_high64 + (Lower32of64(in_high64) * uint64_t{0x85EBCA76});
  lo ^= EndianSwapValue(hi);
  tmp128 = Multiply64to128(lo, 0xC2B2AE3D27D4EB4FU);
  lo = Lower64of128(tmp128);
  hi = Upper64of128(tmp128) + (hi * 0xC2B2AE3D27D4EB4FU);
  *out_low64 = XXH3_avalanche(lo);
  *out_high64 = XXH3_avalanche(hi);
}

void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed,
                         uint64_t* out_high64, uint64_t* out_low64) {
  // Inverted above (also consulting XXH3_len_9to16_128b)
  const uint64_t bitflipl = /*secret part*/ 0x59973f0033362349U - seed;
  const uint64_t bitfliph = /*secret part*/ 0xc202797692d63d58U + seed;
  uint64_t lo = XXH3_unavalanche(in_low64);
  uint64_t hi = XXH3_unavalanche(in_high64);
  lo *= 0xba79078168d4baf;  // inverse of 0xC2B2AE3D27D4EB4FU
  hi -= Upper64of128(Multiply64to128(lo, 0xC2B2AE3D27D4EB4FU));
  hi *= 0xba79078168d4baf;  // inverse of 0xC2B2AE3D27D4EB4FU
  lo ^= EndianSwapValue(hi);
  lo -= 0x3c0000000000000U;
  lo *= 0x887493432badb37U;  // inverse of 0x9E3779B185EBCA87U
  hi -= Upper64of128(Multiply64to128(lo, 0x9E3779B185EBCA87U));
  uint32_t tmp32 = Lower32of64(hi) * 0xb6c92f47;  // inverse of 0x85EBCA77
  hi -= tmp32;
  hi = (hi & 0xFFFFFFFF00000000U) -
       ((tmp32 * uint64_t{0x85EBCA76}) & 0xFFFFFFFF00000000U) + tmp32;
  hi ^= bitfliph;
  lo ^= hi ^ bitflipl;
  *out_high64 = hi;
  *out_low64 = lo;
}
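
// Illustrative sketch (not part of the original file): for any seed, the two
// functions above are inverses of each other, which is what makes the
// mapping on 128-bit values bijective.
//
//   uint64_t h = 0, l = 0;
//   BijectiveHash2x64(/*in_high64=*/123, /*in_low64=*/456, /*seed=*/7, &h, &l);
//   uint64_t rh = 0, rl = 0;
//   BijectiveUnhash2x64(h, l, /*seed=*/7, &rh, &rl);
//   assert(rh == 123 && rl == 456);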

void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64,
                       uint64_t* out_high64, uint64_t* out_low64) {
  BijectiveHash2x64(in_high64, in_low64, /*seed*/ 0, out_high64, out_low64);
}

void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64,
                         uint64_t* out_high64, uint64_t* out_low64) {
  BijectiveUnhash2x64(in_high64, in_low64, /*seed*/ 0, out_high64, out_low64);
}
}  // namespace ROCKSDB_NAMESPACE