/src/rocksdb/util/hash.cc
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "util/hash.h"

#include <cassert>
#include <string>

#include "port/lang.h"
#include "util/coding.h"
#include "util/hash128.h"
#include "util/math128.h"
#include "util/xxhash.h"
#include "util/xxph3.h"

namespace ROCKSDB_NAMESPACE {

uint64_t (*kGetSliceNPHash64UnseededFnPtr)(const Slice&) = &GetSliceHash64;

uint32_t Hash(const char* data, size_t n, uint32_t seed) {
  // MurmurHash1 - fast but mediocre quality
  // https://github.com/aappleby/smhasher/wiki/MurmurHash1
  //
  const uint32_t m = 0xc6a4a793;
  const uint32_t r = 24;
  const char* limit = data + n;
  uint32_t h = static_cast<uint32_t>(seed ^ (n * m));

  // Pick up four bytes at a time
  while (data + 4 <= limit) {
    uint32_t w = DecodeFixed32(data);
    data += 4;
    h += w;
    h *= m;
    h ^= (h >> 16);
  }

  // Pick up remaining bytes
  switch (limit - data) {
    // Note: The original hash implementation used data[i] << shift, which
    // promotes the char to int and then performs the shift. If the char is
    // negative, the shift is undefined behavior in C++. The hash algorithm is
    // part of the format definition, so we cannot change it; to obtain the
    // same behavior in a legal way we just cast to uint32_t, which will do
    // sign-extension. To guarantee compatibility with architectures where
    // chars are unsigned we first cast the char to int8_t.
    case 3:
      h += static_cast<uint32_t>(static_cast<int8_t>(data[2])) << 16;
      FALLTHROUGH_INTENDED;
    case 2:
      h += static_cast<uint32_t>(static_cast<int8_t>(data[1])) << 8;
      FALLTHROUGH_INTENDED;
    case 1:
      h += static_cast<uint32_t>(static_cast<int8_t>(data[0]));
      h *= m;
      h ^= (h >> r);
      break;
  }
  return h;
}
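
// Illustrative usage sketch (not part of the original file): Hash() is the
// legacy 32-bit entry point, typically called with an arbitrary fixed seed.
//
//   std::string key = "abc";
//   uint32_t h32 = Hash(key.data(), key.size(), /*seed=*/0xbc9f1d34);
//
// The seed shown here is only an example value.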

// We are standardizing on a preview release of XXH3, because that's
// the best available at time of standardizing.
//
// In testing (mostly Intel Skylake), this hash function is much more
// thorough than Hash32 and is almost universally faster. Hash() only
// seems faster when passing runtime-sized keys of the same small size
// (less than about 24 bytes) thousands of times in a row; this seems
// to allow the branch predictor to work some magic. XXH3's speed is
// much less dependent on branch prediction.
//
// Hashing with a prefix extractor is potentially a common case of
// hashing objects of small, predictable size. We could consider
// bundling hash functions specialized for particular lengths with
// the prefix extractors.
uint64_t Hash64(const char* data, size_t n, uint64_t seed) {
  return XXPH3_64bits_withSeed(data, n, seed);
}

uint64_t Hash64(const char* data, size_t n) {
  // Same as seed = 0
  return XXPH3_64bits(data, n);
}
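
// Illustrative note (not part of the original file): per the "Same as
// seed = 0" comment above, the unseeded overload agrees with the seeded one
// at seed == 0.
//
//   assert(Hash64("abc", 3) == Hash64("abc", 3, /*seed=*/0));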

uint64_t GetSlicePartsNPHash64(const SliceParts& data, uint64_t seed) {
  // TODO(ajkr): use XXH3 streaming APIs to avoid the copy/allocation.
  size_t concat_len = 0;
  for (int i = 0; i < data.num_parts; ++i) {
    concat_len += data.parts[i].size();
  }
  std::string concat_data;
  concat_data.reserve(concat_len);
  for (int i = 0; i < data.num_parts; ++i) {
    concat_data.append(data.parts[i].data(), data.parts[i].size());
  }
  assert(concat_data.size() == concat_len);
  return NPHash64(concat_data.data(), concat_len, seed);
}
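
// Illustrative sketch (not part of the original file): since the parts are
// concatenated before hashing, the result matches NPHash64 over the
// flattened key bytes.
//
//   Slice parts[2] = {Slice("foo"), Slice("bar")};
//   SliceParts key(parts, /*num_parts=*/2);
//   // GetSlicePartsNPHash64(key, seed) == NPHash64("foobar", 6, seed)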

Unsigned128 Hash128(const char* data, size_t n, uint64_t seed) {
  auto h = XXH3_128bits_withSeed(data, n, seed);
  return (Unsigned128{h.high64} << 64) | (h.low64);
}

Unsigned128 Hash128(const char* data, size_t n) {
  // Same as seed = 0
  auto h = XXH3_128bits(data, n);
  return (Unsigned128{h.high64} << 64) | (h.low64);
}

void Hash2x64(const char* data, size_t n, uint64_t* high64, uint64_t* low64) {
  // Same as seed = 0
  auto h = XXH3_128bits(data, n);
  *high64 = h.high64;
  *low64 = h.low64;
}

void Hash2x64(const char* data, size_t n, uint64_t seed, uint64_t* high64,
              uint64_t* low64) {
  auto h = XXH3_128bits_withSeed(data, n, seed);
  *high64 = h.high64;
  *low64 = h.low64;
}
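
// Illustrative sketch (not part of the original file): Hash128 packs the
// same two 64-bit halves that Hash2x64 writes out, so for any input:
//
//   uint64_t hi, lo;
//   Hash2x64("abc", 3, &hi, &lo);
//   // Hash128("abc", 3) == (Unsigned128{hi} << 64) | lo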

namespace {

inline uint64_t XXH3_avalanche(uint64_t h64) {
  h64 ^= h64 >> 37;
  h64 *= 0x165667919E3779F9U;
  h64 ^= h64 >> 32;
  return h64;
}

inline uint64_t XXH3_unavalanche(uint64_t h64) {
  h64 ^= h64 >> 32;
  h64 *= 0x8da8ee41d6df849U;  // inverse of 0x165667919E3779F9U
  h64 ^= h64 >> 37;
  return h64;
}

}  // namespace
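
// Illustrative note (not part of the original file): XXH3_unavalanche undoes
// XXH3_avalanche by applying the inverse steps in reverse order. x ^= x >> k
// is its own inverse when k >= 32, and the multiplication by an odd constant
// is undone by its multiplicative inverse mod 2^64, so:
//
//   uint64_t x = 0x0123456789abcdefU;
//   assert(XXH3_unavalanche(XXH3_avalanche(x)) == x);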

void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed,
                       uint64_t* out_high64, uint64_t* out_low64) {
  // Adapted from XXH3_len_9to16_128b
  const uint64_t bitflipl = /*secret part*/ 0x59973f0033362349U - seed;
  const uint64_t bitfliph = /*secret part*/ 0xc202797692d63d58U + seed;
  Unsigned128 tmp128 =
      Multiply64to128(in_low64 ^ in_high64 ^ bitflipl, 0x9E3779B185EBCA87U);
  uint64_t lo = Lower64of128(tmp128);
  uint64_t hi = Upper64of128(tmp128);
  lo += 0x3c0000000000000U;  // (len - 1) << 54
  in_high64 ^= bitfliph;
  hi += in_high64 + (Lower32of64(in_high64) * uint64_t{0x85EBCA76});
  lo ^= EndianSwapValue(hi);
  tmp128 = Multiply64to128(lo, 0xC2B2AE3D27D4EB4FU);
  lo = Lower64of128(tmp128);
  hi = Upper64of128(tmp128) + (hi * 0xC2B2AE3D27D4EB4FU);
  *out_low64 = XXH3_avalanche(lo);
  *out_high64 = XXH3_avalanche(hi);
}

void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed,
                         uint64_t* out_high64, uint64_t* out_low64) {
  // Inverted above (also consulting XXH3_len_9to16_128b)
  const uint64_t bitflipl = /*secret part*/ 0x59973f0033362349U - seed;
  const uint64_t bitfliph = /*secret part*/ 0xc202797692d63d58U + seed;
  uint64_t lo = XXH3_unavalanche(in_low64);
  uint64_t hi = XXH3_unavalanche(in_high64);
  lo *= 0xba79078168d4baf;  // inverse of 0xC2B2AE3D27D4EB4FU
  hi -= Upper64of128(Multiply64to128(lo, 0xC2B2AE3D27D4EB4FU));
  hi *= 0xba79078168d4baf;  // inverse of 0xC2B2AE3D27D4EB4FU
  lo ^= EndianSwapValue(hi);
  lo -= 0x3c0000000000000U;
  lo *= 0x887493432badb37U;  // inverse of 0x9E3779B185EBCA87U
  hi -= Upper64of128(Multiply64to128(lo, 0x9E3779B185EBCA87U));
  uint32_t tmp32 = Lower32of64(hi) * 0xb6c92f47;  // inverse of 0x85EBCA77
  hi -= tmp32;
  hi = (hi & 0xFFFFFFFF00000000U) -
       ((tmp32 * uint64_t{0x85EBCA76}) & 0xFFFFFFFF00000000U) + tmp32;
  hi ^= bitfliph;
  lo ^= hi ^ bitflipl;
  *out_high64 = hi;
  *out_low64 = lo;
}
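
// Illustrative sketch (not part of the original file): for any seed, the two
// functions above are inverses of each other, which is what makes the
// mapping on 128-bit values bijective.
//
//   uint64_t h = 0, l = 0;
//   BijectiveHash2x64(/*in_high64=*/123, /*in_low64=*/456, /*seed=*/7, &h, &l);
//   uint64_t rh = 0, rl = 0;
//   BijectiveUnhash2x64(h, l, /*seed=*/7, &rh, &rl);
//   assert(rh == 123 && rl == 456);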

void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64,
                       uint64_t* out_high64, uint64_t* out_low64) {
  BijectiveHash2x64(in_high64, in_low64, /*seed*/ 0, out_high64, out_low64);
}

void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64,
                         uint64_t* out_high64, uint64_t* out_low64) {
  BijectiveUnhash2x64(in_high64, in_low64, /*seed*/ 0, out_high64, out_low64);
}
}  // namespace ROCKSDB_NAMESPACE