/src/rocksdb/util/xxph3.h
Line | Count | Source |
1 | | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. |
2 | | // This source code is licensed under both the GPLv2 (found in the |
3 | | // COPYING file in the root directory) and Apache 2.0 License |
4 | | // (found in the LICENSE.Apache file in the root directory). |
5 | | /* |
6 | | xxHash - Extremely Fast Hash algorithm |
7 | | Header File |
8 | | Copyright (C) 2012-2016, Yann Collet. |
9 | | |
10 | | BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) |
11 | | |
12 | | Redistribution and use in source and binary forms, with or without |
13 | | modification, are permitted provided that the following conditions are |
14 | | met: |
15 | | |
16 | | * Redistributions of source code must retain the above copyright |
17 | | notice, this list of conditions and the following disclaimer. |
18 | | * Redistributions in binary form must reproduce the above |
19 | | copyright notice, this list of conditions and the following disclaimer |
20 | | in the documentation and/or other materials provided with the |
21 | | distribution. |
22 | | |
23 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
24 | | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
25 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
26 | | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
27 | | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
28 | | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
29 | | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
30 | | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
31 | | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
32 | | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
33 | | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
34 | | |
35 | | You can contact the author at : |
36 | | - xxHash source repository : https://github.com/Cyan4973/xxHash |
37 | | */ |
38 | | |
39 | | // This is a fork of a preview version of xxHash, as RocksDB depends on |
40 | | // this preview version of XXH3. To allow this to coexist with the |
41 | | // standard xxHash, including in the "unity" build where all source files |
42 | | // and headers go into a single translation unit, here "XXH" has been |
43 | | // replaced with "XXPH" for XX Preview Hash. |
44 | | |
45 | | #ifndef XXPHASH_H_5627135585666179 |
46 | | #define XXPHASH_H_5627135585666179 1 |
47 | | |
48 | | /* BEGIN RocksDB customizations */ |
49 | | #ifndef XXPH_STATIC_LINKING_ONLY |
50 | | // Access experimental APIs |
51 | | #define XXPH_STATIC_LINKING_ONLY 1 |
52 | | #endif |
53 | | #define XXPH_NAMESPACE ROCKSDB_ |
54 | | #define XXPH_INLINE_ALL |
55 | | #include <cstring> |
56 | | /* END RocksDB customizations */ |
57 | | |
58 | | // clang-format off |
59 | | #if defined (__cplusplus) |
60 | | extern "C" { |
61 | | #endif |
62 | | |
63 | | |
64 | | /* **************************** |
65 | | * Definitions |
66 | | ******************************/ |
67 | | #include <stddef.h> /* size_t */ |
68 | | typedef enum { XXPH_OK=0, XXPH_ERROR } XXPH_errorcode; |
69 | | |
70 | | |
71 | | /* **************************** |
72 | | * API modifier |
73 | | ******************************/ |
74 | | /** XXPH_INLINE_ALL (and XXPH_PRIVATE_API) |
75 | | * This build macro includes xxhash functions in `static` mode |
76 | | * in order to inline them, and remove their symbols from the public list. |
77 | | * Inlining offers great performance improvements on small keys, |
78 | | * and dramatic ones when the length is expressed as a compile-time constant. |
79 | | * See https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html . |
80 | | * Methodology : |
81 | | * #define XXPH_INLINE_ALL |
82 | | * #include "xxhash.h" |
83 | | * `xxhash.c` is automatically included. |
84 | | * It's not useful to compile and link it as a separate object. |
85 | | */ |
86 | | #if defined(XXPH_INLINE_ALL) || defined(XXPH_PRIVATE_API) |
87 | | # ifndef XXPH_STATIC_LINKING_ONLY |
88 | | # define XXPH_STATIC_LINKING_ONLY |
89 | | # endif |
90 | | # if defined(__GNUC__) |
91 | | # define XXPH_PUBLIC_API static __inline __attribute__((unused)) |
92 | | # elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) |
93 | | # define XXPH_PUBLIC_API static inline |
94 | | # elif defined(_MSC_VER) |
95 | | # define XXPH_PUBLIC_API static __inline |
96 | | # else |
97 | | /* this version may generate warnings for unused static functions */ |
98 | | # define XXPH_PUBLIC_API static |
99 | | # endif |
100 | | #else |
101 | | # if defined(WIN32) && defined(_MSC_VER) && (defined(XXPH_IMPORT) || defined(XXPH_EXPORT)) |
102 | | # ifdef XXPH_EXPORT |
103 | | # define XXPH_PUBLIC_API __declspec(dllexport) |
104 | | # elif XXPH_IMPORT |
105 | | # define XXPH_PUBLIC_API __declspec(dllimport) |
106 | | # endif |
107 | | # else |
108 | | # define XXPH_PUBLIC_API /* do nothing */ |
109 | | # endif |
110 | | #endif /* XXPH_INLINE_ALL || XXPH_PRIVATE_API */ |
111 | | |
112 | | /*! XXPH_NAMESPACE, aka Namespace Emulation : |
113 | | * |
114 | | * If you want to include _and expose_ xxHash functions from within your own library, |
115 | | * but also want to avoid symbol collisions with other libraries which may also include xxHash, |
116 | | * |
117 | | * you can use XXPH_NAMESPACE to automatically prefix any public symbol from the xxhash library |
118 | | * with the value of XXPH_NAMESPACE (therefore, avoid NULL and numeric values). |
119 | | * |
120 | | * Note that no change is required within the calling program as long as it includes `xxhash.h` : |
121 | | * regular symbol names will be automatically translated by this header. |
122 | | */ |
123 | | #ifdef XXPH_NAMESPACE |
124 | 10.2M | # define XXPH_CAT(A,B) A##B |
125 | 10.2M | # define XXPH_NAME2(A,B) XXPH_CAT(A,B) |
126 | | # define XXPH_versionNumber XXPH_NAME2(XXPH_NAMESPACE, XXPH_versionNumber) |
127 | | #endif |
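As a brief caller-side illustration of the namespace emulation above (hypothetical snippet, not part of the original header): with XXPH_NAMESPACE set to ROCKSDB_ as in the RocksDB customizations, the caller keeps writing the regular name and the macro maps it to the prefixed symbol.

    /* Hypothetical caller code, shown only to illustrate the macro mapping. */
    unsigned v = XXPH_versionNumber();  /* expands to ROCKSDB_XXPH_versionNumber() */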
128 | | |
129 | | |
130 | | /* ************************************* |
131 | | * Version |
132 | | ***************************************/ |
133 | | #define XXPH_VERSION_MAJOR 0 |
134 | | #define XXPH_VERSION_MINOR 7 |
135 | | #define XXPH_VERSION_RELEASE 2 |
136 | | #define XXPH_VERSION_NUMBER (XXPH_VERSION_MAJOR *100*100 + XXPH_VERSION_MINOR *100 + XXPH_VERSION_RELEASE) |
137 | | XXPH_PUBLIC_API unsigned XXPH_versionNumber (void); |
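As a quick worked example of the formula above: with major 0, minor 7, release 2, XXPH_VERSION_NUMBER evaluates to 0*100*100 + 7*100 + 2 = 702, which is the value XXPH_versionNumber() returns for this preview header.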
138 | | |
139 | | |
140 | | /*-********************************************************************** |
141 | | * 32-bit hash |
142 | | ************************************************************************/ |
143 | | #if !defined (__VMS) \ |
144 | | && (defined (__cplusplus) \ |
145 | | || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) |
146 | | # include <stdint.h> |
147 | | typedef uint32_t XXPH32_hash_t; |
148 | | #else |
149 | | # include <limits.h> |
150 | | # if UINT_MAX == 0xFFFFFFFFUL |
151 | | typedef unsigned int XXPH32_hash_t; |
152 | | # else |
153 | | # if ULONG_MAX == 0xFFFFFFFFUL |
154 | | typedef unsigned long XXPH32_hash_t; |
155 | | # else |
156 | | # error "unsupported platform : need a 32-bit type" |
157 | | # endif |
158 | | # endif |
159 | | #endif |
160 | | |
161 | | #ifndef XXPH_NO_LONG_LONG |
162 | | /*-********************************************************************** |
163 | | * 64-bit hash |
164 | | ************************************************************************/ |
165 | | #if !defined (__VMS) \ |
166 | | && (defined (__cplusplus) \ |
167 | | || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) |
168 | | # include <stdint.h> |
169 | | typedef uint64_t XXPH64_hash_t; |
170 | | #else |
171 | | /* the following type must have a width of 64-bit */ |
172 | | typedef unsigned long long XXPH64_hash_t; |
173 | | #endif |
174 | | |
175 | | #endif /* XXPH_NO_LONG_LONG */ |
176 | | |
177 | | |
178 | | |
179 | | #ifdef XXPH_STATIC_LINKING_ONLY |
180 | | |
181 | | /* ================================================================================================ |
182 | | This section contains declarations which are not guaranteed to remain stable. |
183 | | They may change in future versions, becoming incompatible with a different version of the library. |
184 | | These declarations should only be used with static linking. |
185 | | Never use them in association with dynamic linking ! |
186 | | =================================================================================================== */ |
187 | | |
188 | | |
189 | | /*-********************************************************************** |
190 | | * XXPH3 |
191 | | * New experimental hash |
192 | | ************************************************************************/ |
193 | | #ifndef XXPH_NO_LONG_LONG |
194 | | |
195 | | |
196 | | /* ============================================ |
197 | | * XXPH3 is a new hash algorithm, |
198 | | * featuring improved speed performance for both small and large inputs. |
199 | | * See full speed analysis at : http://fastcompression.blogspot.com/2019/03/presenting-xxh3.html |
200 | | * In general, expect XXPH3 to run roughly 2x faster on large inputs, |
201 | | * and >3x faster on small ones, though exact differences depend on the platform. |
202 | | * |
203 | | * The algorithm is portable: it will generate the same hash value on all platforms. |
204 | | * It benefits greatly from vectorization units, but does not require them. |
205 | | * |
206 | | * XXPH3 offers 2 variants, _64bits and _128bits. |
207 | | * When only 64 bits are needed, prefer calling the _64bits variant : |
208 | | * it reduces the amount of mixing, resulting in faster speed on small inputs. |
209 | | * It's also generally simpler to manipulate a scalar return type than a struct. |
210 | | * |
211 | | * The XXPH3 algorithm is still considered experimental. |
212 | | * Produced results can still change between versions. |
213 | | * Results produced by v0.7.x are not comparable with results from v0.7.y . |
214 | | * It's nonetheless possible to use XXPH3 for ephemeral data (local sessions), |
215 | | * but avoid storing values in long-term storage for later reads. |
216 | | * |
217 | | * The API supports one-shot hashing, streaming mode, and custom secrets. |
218 | | * |
219 | | * There are still a number of open questions that the community can influence during the experimental period. |
220 | | * I'm trying to list a few of them below, though don't consider this list as complete. |
221 | | * |
222 | | * - 128-bits output type : currently defined as a structure of two 64-bits fields. |
223 | | * That's because 128-bit values do not exist in C standard. |
224 | | * Note that it means that, at byte level, the result is not identical depending on endianness. |
225 | | * However, at field level, they are identical on all platforms. |
226 | | * The canonical representation solves the issue of identical byte-level representation across platforms, |
227 | | * which is necessary for serialization. |
228 | | * Q1 : Would there be a better representation for a 128-bit hash result ? |
229 | | * Q2 : Are the names of the inner 64-bit fields important ? Should they be changed ? |
230 | | * |
231 | | * - Prototype XXPH128() : XXPH128() uses the same arguments as XXPH64(), for consistency. |
232 | | * It means it maps to XXPH3_128bits_withSeed(). |
233 | | * This variant is slightly slower than XXPH3_128bits(), |
234 | | * because the seed is now part of the algorithm, and can't be simplified. |
235 | | * Is that a good idea ? |
236 | | * |
237 | | * - Seed type for XXPH128() : currently, it's a single 64-bit value, like the 64-bit variant. |
238 | | * It could be argued that it's more logical to offer a 128-bit seed input parameter for a 128-bit hash. |
239 | | * But a 128-bit seed is more difficult to use, since it requires passing a structure instead of a scalar value. |
240 | | * Such a variant could either replace the current one, or become an additional one. |
241 | | * Farmhash, for example, offers both variants (the 128-bit seed variant is called `doubleSeed`). |
242 | | * Follow-up question : if both 64-bit and 128-bit seeds are allowed, which variant should be called XXPH128 ? |
243 | | * |
244 | | * - Result for len==0 : Currently, the result of hashing a zero-length input is always `0`. |
245 | | * It seems okay as a return value when using "default" secret and seed. |
246 | | * But is it still fine to return `0` when secret or seed are non-default ? |
247 | | * Are there use cases which could depend on generating a different hash result for zero-length input when the secret is different ? |
248 | | * |
249 | | * - Consistency (1) : Streaming XXPH128 uses an XXPH3 state, which is the same state as XXPH3_64bits(). |
250 | | * It means a 128-bit streaming loop must invoke the following symbols : |
251 | | * XXPH3_createState(), XXPH3_128bits_reset(), XXPH3_128bits_update() (loop), XXPH3_128bits_digest(), XXPH3_freeState(). |
252 | | * Is that consistent enough ? |
253 | | * |
254 | | * - Consistency (2) : The canonical representation of `XXPH3_64bits` is provided by existing functions |
255 | | * XXPH64_canonicalFromHash(), and reverse operation XXPH64_hashFromCanonical(). |
256 | | * As a mirror, canonical functions for XXPH128_hash_t results generated by `XXPH3_128bits` |
257 | | * are XXPH128_canonicalFromHash() and XXPH128_hashFromCanonical(). |
258 | | * Which means, `XXPH3` doesn't appear in the names, because canonical functions operate on a type, |
259 | | * independently of which algorithm was used to generate that type. |
260 | | * Is that consistent enough ? |
261 | | */ |
262 | | |
263 | | #ifdef XXPH_NAMESPACE |
264 | 0 | # define XXPH3_64bits XXPH_NAME2(XXPH_NAMESPACE, XXPH3_64bits) |
265 | | # define XXPH3_64bits_withSecret XXPH_NAME2(XXPH_NAMESPACE, XXPH3_64bits_withSecret) |
266 | 10.2M | # define XXPH3_64bits_withSeed XXPH_NAME2(XXPH_NAMESPACE, XXPH3_64bits_withSeed) |
267 | | #endif |
268 | | |
269 | | /* XXPH3_64bits() : |
270 | | * default 64-bit variant, using default secret and default seed of 0. |
271 | | * It's the fastest variant. */ |
272 | | XXPH_PUBLIC_API XXPH64_hash_t XXPH3_64bits(const void* data, size_t len); |
273 | | |
274 | | /* XXPH3_64bits_withSecret() : |
275 | | * It's possible to provide any blob of bytes as a "secret" to generate the hash. |
276 | | * This makes it more difficult for an external actor to prepare an intentional collision. |
277 | | * The secret *must* be large enough (>= XXPH3_SECRET_SIZE_MIN). |
278 | | * It should consist of random bytes. |
279 | | * Avoid repeating the same character or sequences of bytes, |
280 | | * and especially avoid swathes of \0. |
281 | | * Failure to respect these conditions will result in a poor quality hash. |
282 | | */ |
283 | 152k | #define XXPH3_SECRET_SIZE_MIN 136 |
284 | | XXPH_PUBLIC_API XXPH64_hash_t XXPH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); |
285 | | |
286 | | /* XXPH3_64bits_withSeed() : |
287 | | * This variant generates a custom secret on the fly, |
288 | | * based on the default secret, altered using the `seed` value. |
289 | | * While this operation is decently fast, note that it's not completely free. |
290 | | * note : seed==0 produces the same result as XXPH3_64bits() |
291 | | XXPH_PUBLIC_API XXPH64_hash_t XXPH3_64bits_withSeed(const void* data, size_t len, XXPH64_hash_t seed); |
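A minimal caller-side sketch of the three one-shot 64-bit entry points declared above; the input buffer and the secret-filling step are hypothetical and not part of this header:

    static const char msg[] = "hello xxph3";                  /* hypothetical input */
    XXPH64_hash_t h1 = XXPH3_64bits(msg, sizeof(msg) - 1);
    XXPH64_hash_t h2 = XXPH3_64bits_withSeed(msg, sizeof(msg) - 1, 2019);

    unsigned char secret[XXPH3_SECRET_SIZE_MIN];
    /* ... fill `secret` with random bytes (avoid repeated bytes and runs of \0) ... */
    XXPH64_hash_t h3 = XXPH3_64bits_withSecret(msg, sizeof(msg) - 1, secret, sizeof(secret));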
292 | | |
293 | | #if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */ |
294 | | # include <stdalign.h> |
295 | | # define XXPH_ALIGN(n) alignas(n) |
296 | | #elif defined(__GNUC__) |
297 | 6.23M | # define XXPH_ALIGN(n) __attribute__ ((aligned(n))) |
298 | | #elif defined(_MSC_VER) |
299 | | # define XXPH_ALIGN(n) __declspec(align(n)) |
300 | | #else |
301 | | # define XXPH_ALIGN(n) /* disabled */ |
302 | | #endif |
303 | | |
304 | | #define XXPH3_SECRET_DEFAULT_SIZE 192 /* minimum XXPH3_SECRET_SIZE_MIN */ |
305 | | |
306 | | #endif /* XXPH_NO_LONG_LONG */ |
307 | | |
308 | | |
309 | | /*-********************************************************************** |
310 | | * XXPH_INLINE_ALL |
311 | | ************************************************************************/ |
312 | | #if defined(XXPH_INLINE_ALL) || defined(XXPH_PRIVATE_API) |
313 | | |
314 | | /* === RocksDB modification: was #include here but permanently inlining === */ |
315 | | |
316 | | typedef struct { |
317 | | XXPH64_hash_t low64; |
318 | | XXPH64_hash_t high64; |
319 | | } XXPH128_hash_t; |
320 | | |
321 | | /* ************************************* |
322 | | * Tuning parameters |
323 | | ***************************************/ |
324 | | /*!XXPH_FORCE_MEMORY_ACCESS : |
325 | | * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. |
326 | | * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. |
327 | | * The switch below allows selecting a different access method for improved performance. |
328 | | * Method 0 (default) : use `memcpy()`. Safe and portable. |
329 | | * Method 1 : `__packed` statement. It depends on a compiler extension (i.e., not portable). |
330 | | * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. |
331 | | * Method 2 : direct access. This method doesn't depend on the compiler but violates the C standard. |
332 | | * It can generate buggy code on targets which do not support unaligned memory accesses. |
333 | | * But in some circumstances, it's the only known way to get the best performance (i.e. GCC + ARMv6). |
334 | | * See http://stackoverflow.com/a/32095106/646947 for details. |
335 | | * Prefer these methods in priority order (0 > 1 > 2) |
336 | | */ |
337 | | #ifndef XXPH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ |
338 | | # if !defined(__clang__) && defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && (__ARM_ARCH == 6) |
339 | | # define XXPH_FORCE_MEMORY_ACCESS 2 |
340 | | # elif !defined(__clang__) && ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \ |
341 | | (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7))) |
342 | | # define XXPH_FORCE_MEMORY_ACCESS 1 |
343 | | # endif |
344 | | #endif |
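As the comment above notes, XXPH_FORCE_MEMORY_ACCESS can also be chosen externally, e.g. with -DXXPH_FORCE_MEMORY_ACCESS=1 on the compiler command line. A hypothetical wrapper-header sketch (include path assumed, not part of RocksDB) that forces the `__packed`-based method 1:

    /* Hypothetical wrapper: pick access method 1 before the header is seen. */
    #define XXPH_FORCE_MEMORY_ACCESS 1
    #include "util/xxph3.h"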
345 | | |
346 | | /*!XXPH_ACCEPT_NULL_INPUT_POINTER : |
347 | | * If the input pointer is NULL, xxHash's default behavior is to dereference it, triggering a segfault. |
348 | | * When this macro is enabled, xxHash actively checks the input for a null pointer. |
349 | | * If it is NULL, the result is the same as for a zero-length input. |
350 | | */ |
351 | | #ifndef XXPH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */ |
352 | | # define XXPH_ACCEPT_NULL_INPUT_POINTER 0 |
353 | | #endif |
354 | | |
355 | | /*!XXPH_FORCE_ALIGN_CHECK : |
356 | | * This is a minor performance trick, only useful with lots of very small keys. |
357 | | * It means : check for aligned/unaligned input. |
358 | | * The check costs one initial branch per hash; |
359 | | * set it to 0 when the input is guaranteed to be aligned, |
360 | | * or when alignment doesn't matter for performance. |
361 | | */ |
362 | | #ifndef XXPH_FORCE_ALIGN_CHECK /* can be defined externally */ |
363 | | # if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) |
364 | | # define XXPH_FORCE_ALIGN_CHECK 0 |
365 | | # else |
366 | | # define XXPH_FORCE_ALIGN_CHECK 1 |
367 | | # endif |
368 | | #endif |
369 | | |
370 | | /*!XXPH_REROLL: |
371 | | * Whether to reroll XXPH32_finalize, and XXPH64_finalize, |
372 | | * instead of using an unrolled jump table/if statement loop. |
373 | | * |
374 | | * This is automatically defined on -Os/-Oz on GCC and Clang. */ |
375 | | #ifndef XXPH_REROLL |
376 | | # if defined(__OPTIMIZE_SIZE__) |
377 | | # define XXPH_REROLL 1 |
378 | | # else |
379 | | # define XXPH_REROLL 0 |
380 | | # endif |
381 | | #endif |
382 | | |
383 | | #include <limits.h> /* ULLONG_MAX */ |
384 | | |
385 | | #ifndef XXPH_STATIC_LINKING_ONLY |
386 | | #define XXPH_STATIC_LINKING_ONLY |
387 | | #endif |
388 | | |
389 | | /* ************************************* |
390 | | * Compiler Specific Options |
391 | | ***************************************/ |
392 | | #ifdef _MSC_VER /* Visual Studio */ |
393 | | # pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ |
394 | | # define XXPH_FORCE_INLINE static __forceinline |
395 | | # define XXPH_NO_INLINE static __declspec(noinline) |
396 | | #else |
397 | | # if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ |
398 | | # ifdef __GNUC__ |
399 | | # define XXPH_FORCE_INLINE static inline __attribute__((always_inline)) |
400 | | # define XXPH_NO_INLINE static __attribute__((noinline)) |
401 | | # else |
402 | | # define XXPH_FORCE_INLINE static inline |
403 | | # define XXPH_NO_INLINE static |
404 | | # endif |
405 | | # else |
406 | | # define XXPH_FORCE_INLINE static |
407 | | # define XXPH_NO_INLINE static |
408 | | # endif /* __STDC_VERSION__ */ |
409 | | #endif |
410 | | |
411 | | |
412 | | |
413 | | /* ************************************* |
414 | | * Debug |
415 | | ***************************************/ |
416 | | /* DEBUGLEVEL is expected to be defined externally, |
417 | | * typically through compiler command line. |
418 | | * Value must be a number. */ |
419 | | #ifndef DEBUGLEVEL |
420 | | # define DEBUGLEVEL 0 |
421 | | #endif |
422 | | |
423 | | #if (DEBUGLEVEL>=1) |
424 | | # include <assert.h> /* note : can still be disabled with NDEBUG */ |
425 | | # define XXPH_ASSERT(c) assert(c) |
426 | | #else |
427 | 39.7M | # define XXPH_ASSERT(c) ((void)0) |
428 | | #endif |
429 | | |
430 | | /* note : use after variable declarations */ |
431 | 139k | #define XXPH_STATIC_ASSERT(c) { enum { XXPH_sa = 1/(int)(!!(c)) }; } |
432 | | |
433 | | |
434 | | /* ************************************* |
435 | | * Basic Types |
436 | | ***************************************/ |
437 | | #if !defined (__VMS) \ |
438 | | && (defined (__cplusplus) \ |
439 | | || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) |
440 | | # include <stdint.h> |
441 | | typedef uint8_t xxh_u8; |
442 | | #else |
443 | | typedef unsigned char xxh_u8; |
444 | | #endif |
445 | | typedef XXPH32_hash_t xxh_u32; |
446 | | |
447 | | |
448 | | /* === Memory access === */ |
449 | | |
450 | | #if (defined(XXPH_FORCE_MEMORY_ACCESS) && (XXPH_FORCE_MEMORY_ACCESS==2)) |
451 | | |
452 | | /* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ |
453 | | static xxh_u32 XXPH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; } |
454 | | |
455 | | #elif (defined(XXPH_FORCE_MEMORY_ACCESS) && (XXPH_FORCE_MEMORY_ACCESS==1)) |
456 | | |
457 | | /* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ |
458 | | /* currently only defined for gcc and icc */ |
459 | | typedef union { xxh_u32 u32; } __attribute__((packed)) unalign; |
460 | | static xxh_u32 XXPH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } |
461 | | |
462 | | #else |
463 | | |
464 | | /* portable and safe solution. Generally efficient. |
465 | | * see : http://stackoverflow.com/a/32095106/646947 |
466 | | */ |
467 | | static xxh_u32 XXPH_read32(const void* memPtr) |
468 | 11.8M | { |
469 | 11.8M | xxh_u32 val; |
470 | 11.8M | memcpy(&val, memPtr, sizeof(val)); |
471 | 11.8M | return val; |
472 | 11.8M | } |
473 | | |
474 | | #endif /* XXPH_FORCE_DIRECT_MEMORY_ACCESS */ |
475 | | |
476 | | |
477 | | /* === Endianness === */ |
478 | | |
479 | | /* XXPH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */ |
480 | | #ifndef XXPH_CPU_LITTLE_ENDIAN |
481 | | # if defined(_WIN32) /* Windows is always little endian */ \ |
482 | | || defined(__LITTLE_ENDIAN__) \ |
483 | | || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) |
484 | 36.1M | # define XXPH_CPU_LITTLE_ENDIAN 1 |
485 | | # elif defined(__BIG_ENDIAN__) \ |
486 | | || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) |
487 | | # define XXPH_CPU_LITTLE_ENDIAN 0 |
488 | | # else |
489 | | static int XXPH_isLittleEndian(void) |
490 | | { |
491 | | const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; /* don't use static : performance detrimental */ |
492 | | return one.c[0]; |
493 | | } |
494 | | # define XXPH_CPU_LITTLE_ENDIAN XXPH_isLittleEndian() |
495 | | # endif |
496 | | #endif |
497 | | |
498 | | |
499 | | |
500 | | |
501 | | /* **************************************** |
502 | | * Compiler-specific Functions and Macros |
503 | | ******************************************/ |
504 | | #define XXPH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) |
505 | | |
506 | | #ifndef __has_builtin |
507 | | # define __has_builtin(x) 0 |
508 | | #endif |
509 | | |
510 | | #if !defined(NO_CLANG_BUILTIN) && __has_builtin(__builtin_rotateleft32) && __has_builtin(__builtin_rotateleft64) |
511 | | # define XXPH_rotl32 __builtin_rotateleft32 |
512 | | # define XXPH_rotl64 __builtin_rotateleft64 |
513 | | /* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */ |
514 | | #elif defined(_MSC_VER) |
515 | | # define XXPH_rotl32(x,r) _rotl(x,r) |
516 | | # define XXPH_rotl64(x,r) _rotl64(x,r) |
517 | | #else |
518 | | # define XXPH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) |
519 | | # define XXPH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) |
520 | | #endif |
521 | | |
522 | | #if defined(_MSC_VER) /* Visual Studio */ |
523 | | # define XXPH_swap32 _byteswap_ulong |
524 | | #elif XXPH_GCC_VERSION >= 403 |
525 | | # define XXPH_swap32 __builtin_bswap32 |
526 | | #else |
527 | | static xxh_u32 XXPH_swap32 (xxh_u32 x) |
528 | 0 | { |
529 | 0 | return ((x << 24) & 0xff000000 ) | |
530 | 0 | ((x << 8) & 0x00ff0000 ) | |
531 | 0 | ((x >> 8) & 0x0000ff00 ) | |
532 | 0 | ((x >> 24) & 0x000000ff ); |
533 | 0 | } |
534 | | #endif |
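As a quick sanity check of the fallback above: XXPH_swap32(0x11223344) reverses the four bytes and returns 0x44332211.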
535 | | |
536 | | |
537 | | /* *************************** |
538 | | * Memory reads |
539 | | *****************************/ |
540 | | typedef enum { XXPH_aligned, XXPH_unaligned } XXPH_alignment; |
541 | | |
542 | | XXPH_FORCE_INLINE xxh_u32 XXPH_readLE32(const void* ptr) |
543 | 11.8M | { |
544 | 11.8M | return XXPH_CPU_LITTLE_ENDIAN ? XXPH_read32(ptr) : XXPH_swap32(XXPH_read32(ptr)); |
545 | 11.8M | } |
546 | | |
547 | | XXPH_FORCE_INLINE xxh_u32 |
548 | | XXPH_readLE32_align(const void* ptr, XXPH_alignment align) |
549 | 0 | { |
550 | 0 | if (align==XXPH_unaligned) { |
551 | 0 | return XXPH_readLE32(ptr); |
552 | 0 | } else { |
553 | 0 | return XXPH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXPH_swap32(*(const xxh_u32*)ptr); |
554 | 0 | } |
555 | 0 | } |
556 | | |
557 | | |
558 | | /* ************************************* |
559 | | * Misc |
560 | | ***************************************/ |
561 | 0 | XXPH_PUBLIC_API unsigned XXPH_versionNumber (void) { return XXPH_VERSION_NUMBER; } |
562 | | |
563 | | |
564 | | static const xxh_u32 PRIME32_1 = 0x9E3779B1U; /* 0b10011110001101110111100110110001 */ |
565 | | static const xxh_u32 PRIME32_2 = 0x85EBCA77U; /* 0b10000101111010111100101001110111 */ |
566 | | static const xxh_u32 PRIME32_3 = 0xC2B2AE3DU; /* 0b11000010101100101010111000111101 */ |
567 | | static const xxh_u32 PRIME32_4 = 0x27D4EB2FU; /* 0b00100111110101001110101100101111 */ |
568 | | static const xxh_u32 PRIME32_5 = 0x165667B1U; /* 0b00010110010101100110011110110001 */ |
569 | | |
570 | | #ifndef XXPH_NO_LONG_LONG |
571 | | |
572 | | /* ******************************************************************* |
573 | | * 64-bit hash functions |
574 | | *********************************************************************/ |
575 | | |
576 | | /*====== Memory access ======*/ |
577 | | |
578 | | typedef XXPH64_hash_t xxh_u64; |
579 | | |
580 | | #if (defined(XXPH_FORCE_MEMORY_ACCESS) && (XXPH_FORCE_MEMORY_ACCESS==2)) |
581 | | |
582 | | /* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ |
583 | | static xxh_u64 XXPH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; } |
584 | | |
585 | | #elif (defined(XXPH_FORCE_MEMORY_ACCESS) && (XXPH_FORCE_MEMORY_ACCESS==1)) |
586 | | |
587 | | /* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ |
588 | | /* currently only defined for gcc and icc */ |
589 | | typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; |
590 | | static xxh_u64 XXPH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; } |
591 | | |
592 | | #else |
593 | | |
594 | | /* portable and safe solution. Generally efficient. |
595 | | * see : http://stackoverflow.com/a/32095106/646947 |
596 | | */ |
597 | | |
598 | | static xxh_u64 XXPH_read64(const void* memPtr) |
599 | 23.2M | { |
600 | 23.2M | xxh_u64 val; |
601 | 23.2M | memcpy(&val, memPtr, sizeof(val)); |
602 | 23.2M | return val; |
603 | 23.2M | } |
604 | | |
605 | | #endif /* XXPH_FORCE_DIRECT_MEMORY_ACCESS */ |
606 | | |
607 | | #if defined(_MSC_VER) /* Visual Studio */ |
608 | | # define XXPH_swap64 _byteswap_uint64 |
609 | | #elif XXPH_GCC_VERSION >= 403 |
610 | | # define XXPH_swap64 __builtin_bswap64 |
611 | | #else |
612 | | static xxh_u64 XXPH_swap64 (xxh_u64 x) |
613 | 0 | { |
614 | 0 | return ((x << 56) & 0xff00000000000000ULL) | |
615 | 0 | ((x << 40) & 0x00ff000000000000ULL) | |
616 | 0 | ((x << 24) & 0x0000ff0000000000ULL) | |
617 | 0 | ((x << 8) & 0x000000ff00000000ULL) | |
618 | 0 | ((x >> 8) & 0x00000000ff000000ULL) | |
619 | 0 | ((x >> 24) & 0x0000000000ff0000ULL) | |
620 | 0 | ((x >> 40) & 0x000000000000ff00ULL) | |
621 | 0 | ((x >> 56) & 0x00000000000000ffULL); |
622 | 0 | } |
623 | | #endif |
624 | | |
625 | | XXPH_FORCE_INLINE xxh_u64 XXPH_readLE64(const void* ptr) |
626 | 23.2M | { |
627 | 23.2M | return XXPH_CPU_LITTLE_ENDIAN ? XXPH_read64(ptr) : XXPH_swap64(XXPH_read64(ptr)); |
628 | 23.2M | } |
629 | | |
630 | | XXPH_FORCE_INLINE xxh_u64 |
631 | | XXPH_readLE64_align(const void* ptr, XXPH_alignment align) |
632 | 0 | { |
633 | 0 | if (align==XXPH_unaligned) |
634 | 0 | return XXPH_readLE64(ptr); |
635 | 0 | else |
636 | 0 | return XXPH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXPH_swap64(*(const xxh_u64*)ptr); |
637 | 0 | } |
638 | | |
639 | | |
640 | | /*====== xxh64 ======*/ |
641 | | |
642 | | static const xxh_u64 PRIME64_1 = 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */ |
643 | | static const xxh_u64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */ |
644 | | static const xxh_u64 PRIME64_3 = 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */ |
645 | | static const xxh_u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */ |
646 | | static const xxh_u64 PRIME64_5 = 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */ |
647 | | |
648 | | |
649 | | /* ********************************************************************* |
650 | | * XXPH3 |
651 | | * New generation hash designed for speed on small keys and vectorization |
652 | | ************************************************************************ */ |
653 | | |
654 | | /*======== Was #include "xxh3.h", now inlined below ==========*/ |
655 | | |
656 | | /* |
657 | | xxHash - Extremely Fast Hash algorithm |
658 | | Development source file for `xxh3` |
659 | | Copyright (C) 2019-present, Yann Collet. |
660 | | |
661 | | BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) |
662 | | |
663 | | Redistribution and use in source and binary forms, with or without |
664 | | modification, are permitted provided that the following conditions are |
665 | | met: |
666 | | |
667 | | * Redistributions of source code must retain the above copyright |
668 | | notice, this list of conditions and the following disclaimer. |
669 | | * Redistributions in binary form must reproduce the above |
670 | | copyright notice, this list of conditions and the following disclaimer |
671 | | in the documentation and/or other materials provided with the |
672 | | distribution. |
673 | | |
674 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
675 | | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
676 | | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
677 | | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
678 | | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
679 | | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
680 | | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
681 | | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
682 | | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
683 | | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
684 | | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
685 | | |
686 | | You can contact the author at : |
687 | | - xxHash source repository : https://github.com/Cyan4973/xxHash |
688 | | */ |
689 | | |
690 | | /* RocksDB Note: This file contains a preview release (xxhash repository |
691 | | version 0.7.2) of XXPH3 that is unlikely to be compatible with the final |
692 | | version of XXPH3. We have therefore renamed this "XXPH3 (preview)" for |
693 | | clarity, so that we can continue to use this version even after |
694 | | integrating a newer, incompatible version. |
695 | | */ |
696 | | |
697 | | /* === Dependencies === */ |
698 | | |
699 | | #undef XXPH_INLINE_ALL /* in case it's already defined */ |
700 | | #define XXPH_INLINE_ALL |
701 | | |
702 | | |
703 | | /* === Compiler specifics === */ |
704 | | |
705 | | #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ |
706 | | # define XXPH_RESTRICT restrict |
707 | | #else |
708 | | /* note : it might be useful to define __restrict or __restrict__ for some C++ compilers */ |
709 | | # define XXPH_RESTRICT /* disable */ |
710 | | #endif |
711 | | |
712 | | #if defined(__GNUC__) |
713 | | # if defined(__AVX2__) |
714 | | # include <immintrin.h> |
715 | | # elif defined(__SSE2__) |
716 | | # include <emmintrin.h> |
717 | | # elif defined(__ARM_NEON__) || defined(__ARM_NEON) |
718 | | # define inline __inline__ /* clang bug */ |
719 | | # include <arm_neon.h> |
720 | | # undef inline |
721 | | # endif |
722 | | #elif defined(_MSC_VER) |
723 | | # include <intrin.h> |
724 | | #endif |
725 | | |
726 | | /* |
727 | | * Sanity check. |
728 | | * |
729 | | * XXPH3 only requires these features to be efficient: |
730 | | * |
731 | | * - Usable unaligned access |
732 | | * - A 32-bit or 64-bit ALU |
733 | | * - If 32-bit, a decent ADC instruction |
734 | | * - A 32 or 64-bit multiply with a 64-bit result |
735 | | * |
736 | | * Almost all 32-bit and 64-bit targets meet this, except for Thumb-1, the |
737 | | * classic 16-bit only subset of ARM's instruction set. |
738 | | * |
739 | | * First of all, Thumb-1 lacks support for the UMULL instruction which |
740 | | * performs the important long multiply. This means numerous __aeabi_lmul |
741 | | * calls. |
742 | | * |
743 | | * Second of all, the 8 functional registers are just not enough. |
744 | | * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need |
745 | | * Lo registers, and this shuffling results in thousands more MOVs than A32. |
746 | | * |
747 | | * A32 and T32 don't have this limitation. They can access all 14 registers, |
748 | | * do a 32->64 multiply with UMULL, and the flexible operand is helpful too. |
749 | | * |
750 | | * If compiling Thumb-1 for a target which supports ARM instructions, we |
751 | | * will give a warning. |
752 | | * |
753 | | * Usually, if this happens, it is because of an accident and you probably |
754 | | * need to specify -march, as you probably meant to compile for a newer |
755 | | * architecture. |
756 | | */ |
757 | | #if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) |
758 | | # warning "XXPH3 is highly inefficient without ARM or Thumb-2." |
759 | | #endif |
760 | | |
761 | | /* ========================================== |
762 | | * Vectorization detection |
763 | | * ========================================== */ |
764 | | #define XXPH_SCALAR 0 |
765 | | #define XXPH_SSE2 1 |
766 | | #define XXPH_AVX2 2 |
767 | | #define XXPH_NEON 3 |
768 | | #define XXPH_VSX 4 |
769 | | |
770 | | #ifndef XXPH_VECTOR /* can be defined on command line */ |
771 | | # if defined(__AVX2__) |
772 | | # define XXPH_VECTOR XXPH_AVX2 |
773 | | # elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) |
774 | | # define XXPH_VECTOR XXPH_SSE2 |
775 | | # elif defined(__GNUC__) /* msvc support maybe later */ \ |
776 | | && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \ |
777 | | && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \ |
778 | | || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) |
779 | | # define XXPH_VECTOR XXPH_NEON |
780 | | # elif defined(__PPC64__) && defined(__POWER8_VECTOR__) && defined(__GNUC__) |
781 | | # define XXPH_VECTOR XXPH_VSX |
782 | | # else |
783 | | # define XXPH_VECTOR XXPH_SCALAR |
784 | | # endif |
785 | | #endif |
786 | | |
787 | | /* control alignment of accumulator, |
788 | | * for compatibility with fast vector loads */ |
789 | | #ifndef XXPH_ACC_ALIGN |
790 | | # if XXPH_VECTOR == 0 /* scalar */ |
791 | | # define XXPH_ACC_ALIGN 8 |
792 | | # elif XXPH_VECTOR == 1 /* sse2 */ |
793 | | # define XXPH_ACC_ALIGN 16 |
794 | | # elif XXPH_VECTOR == 2 /* avx2 */ |
795 | | # define XXPH_ACC_ALIGN 32 |
796 | | # elif XXPH_VECTOR == 3 /* neon */ |
797 | | # define XXPH_ACC_ALIGN 16 |
798 | | # elif XXPH_VECTOR == 4 /* vsx */ |
799 | | # define XXPH_ACC_ALIGN 16 |
800 | | # endif |
801 | | #endif |
802 | | |
803 | | /* xxh_u64 XXPH_mult32to64(xxh_u32 a, xxh_u64 b) { return (xxh_u64)a * (xxh_u64)b; } */ |
804 | | #if defined(_MSC_VER) && defined(_M_IX86) |
805 | | # include <intrin.h> |
806 | | # define XXPH_mult32to64(x, y) __emulu(x, y) |
807 | | #else |
808 | | # define XXPH_mult32to64(x, y) ((xxh_u64)((x) & 0xFFFFFFFF) * (xxh_u64)((y) & 0xFFFFFFFF)) |
809 | | #endif |
810 | | |
811 | | /* VSX stuff. It's a lot because VSX support is mediocre across compilers and |
812 | | * there is a lot of mischief with endianness. */ |
813 | | #if XXPH_VECTOR == XXPH_VSX |
814 | | # include <altivec.h> |
815 | | # undef vector |
816 | | typedef __vector unsigned long long U64x2; |
817 | | typedef __vector unsigned char U8x16; |
818 | | typedef __vector unsigned U32x4; |
819 | | |
820 | | #ifndef XXPH_VSX_BE |
821 | | # if defined(__BIG_ENDIAN__) \ |
822 | | || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) |
823 | | # define XXPH_VSX_BE 1 |
824 | | # elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ |
825 | | # warning "-maltivec=be is not recommended. Please use native endianness." |
826 | | # define XXPH_VSX_BE 1 |
827 | | # else |
828 | | # define XXPH_VSX_BE 0 |
829 | | # endif |
830 | | #endif |
831 | | |
832 | | /* We need some helpers for big endian mode. */ |
833 | | #if XXPH_VSX_BE |
834 | | /* A wrapper for POWER9's vec_revb. */ |
835 | | # ifdef __POWER9_VECTOR__ |
836 | | # define XXPH_vec_revb vec_revb |
837 | | # else |
838 | | XXPH_FORCE_INLINE U64x2 XXPH_vec_revb(U64x2 val) |
839 | | { |
840 | | U8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, |
841 | | 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; |
842 | | return vec_perm(val, val, vByteSwap); |
843 | | } |
844 | | # endif |
845 | | |
846 | | /* Power8 Crypto gives us vpermxor which is very handy for |
847 | | * PPC64EB. |
848 | | * |
849 | | * U8x16 vpermxor(U8x16 a, U8x16 b, U8x16 mask) |
850 | | * { |
851 | | * U8x16 ret; |
852 | | * for (int i = 0; i < 16; i++) { |
853 | | * ret[i] = a[mask[i] & 0xF] ^ b[mask[i] >> 4]; |
854 | | * } |
855 | | * return ret; |
856 | | * } |
857 | | * |
858 | | * Because both of the main loops load the key, swap, and xor it with input, |
859 | | * we can combine the key swap into this instruction. |
860 | | */ |
861 | | # ifdef vec_permxor |
862 | | # define XXPH_vec_permxor vec_permxor |
863 | | # else |
864 | | # define XXPH_vec_permxor __builtin_crypto_vpermxor |
865 | | # endif |
866 | | #endif /* XXPH_VSX_BE */ |
867 | | /* |
868 | | * Because we reinterpret the multiply, there are endian memes: vec_mulo actually becomes |
869 | | * vec_mule. |
870 | | * |
871 | | * Additionally, the intrinsic wasn't added until GCC 8, despite existing for a while. |
872 | | * Clang has an easy way to control this, we can just use the builtin which doesn't swap. |
873 | | * GCC needs inline assembly. */ |
874 | | #if __has_builtin(__builtin_altivec_vmuleuw) |
875 | | # define XXPH_vec_mulo __builtin_altivec_vmulouw |
876 | | # define XXPH_vec_mule __builtin_altivec_vmuleuw |
877 | | #else |
878 | | /* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */ |
879 | | XXPH_FORCE_INLINE U64x2 XXPH_vec_mulo(U32x4 a, U32x4 b) { |
880 | | U64x2 result; |
881 | | __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); |
882 | | return result; |
883 | | } |
884 | | XXPH_FORCE_INLINE U64x2 XXPH_vec_mule(U32x4 a, U32x4 b) { |
885 | | U64x2 result; |
886 | | __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); |
887 | | return result; |
888 | | } |
889 | | #endif /* __has_builtin(__builtin_altivec_vmuleuw) */ |
890 | | #endif /* XXPH_VECTOR == XXPH_VSX */ |
891 | | |
892 | | /* prefetch |
893 | | * can be disabled by declaring the XXPH_NO_PREFETCH build macro */ |
894 | | #if defined(XXPH_NO_PREFETCH) |
895 | | # define XXPH_PREFETCH(ptr) (void)(ptr) /* disabled */ |
896 | | #else |
897 | | #if defined(_MSC_VER) && \ |
898 | | (defined(_M_X64) || \ |
899 | | defined(_M_IX86)) /* _mm_prefetch() is not defined outside of x86/x64 */ |
900 | | # include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ |
901 | | # define XXPH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) |
902 | | # elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) |
903 | 5.63M | # define XXPH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) |
904 | | # else |
905 | | # define XXPH_PREFETCH(ptr) (void)(ptr) /* disabled */ |
906 | | # endif |
907 | | #endif /* XXPH_NO_PREFETCH */ |
908 | | |
909 | | |
910 | | /* ========================================== |
911 | | * XXPH3 default settings |
912 | | * ========================================== */ |
913 | | |
914 | 43.1k | #define XXPH_SECRET_DEFAULT_SIZE 192 /* minimum XXPH3_SECRET_SIZE_MIN */ |
915 | | |
916 | | #if (XXPH_SECRET_DEFAULT_SIZE < XXPH3_SECRET_SIZE_MIN) |
917 | | # error "default keyset is not large enough" |
918 | | #endif |
919 | | |
920 | | XXPH_ALIGN(64) static const xxh_u8 kSecret[XXPH_SECRET_DEFAULT_SIZE] = { |
921 | | 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, |
922 | | 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, |
923 | | 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, |
924 | | 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, |
925 | | 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, |
926 | | 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, |
927 | | 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, |
928 | | 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, |
929 | | |
930 | | 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, |
931 | | 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, |
932 | | 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, |
933 | | 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, |
934 | | }; |
935 | | |
936 | | /* |
937 | | * GCC for x86 has a tendency to use SSE in this loop. While it |
938 | | * successfully avoids swapping (as MUL overwrites EAX and EDX), it |
939 | | * slows it down because instead of free register swap shifts, it |
940 | | * must use pshufd and punpckl/hd. |
941 | | * |
942 | | * To prevent this, we use this attribute to shut off SSE. |
943 | | */ |
944 | | #if defined(__GNUC__) && !defined(__clang__) && defined(__i386__) |
945 | | __attribute__((__target__("no-sse"))) |
946 | | #endif |
947 | | static XXPH128_hash_t |
948 | | XXPH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) |
949 | 6.20M | { |
950 | | /* |
951 | | * GCC/Clang __uint128_t method. |
952 | | * |
953 | | * On most 64-bit targets, GCC and Clang define a __uint128_t type. |
954 | | * This is usually the best way as it usually uses a native long 64-bit |
955 | | * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. |
956 | | * |
957 | | * Usually. |
958 | | * |
959 | | * Even when the target is a 32-bit platform, Clang (and Emscripten) define this |
960 | | * type despite not having the arithmetic for it. This results in a |
961 | | * laggy compiler builtin call which calculates a full 128-bit multiply. |
962 | | * In that case it is best to use the portable one. |
963 | | * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 |
964 | | */ |
965 | 6.20M | #if defined(__GNUC__) && !defined(__wasm__) \ |
966 | 6.20M | && defined(__SIZEOF_INT128__) \ |
967 | 6.20M | || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) |
968 | | |
969 | 6.20M | __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs; |
970 | 6.20M | XXPH128_hash_t const r128 = { (xxh_u64)(product), (xxh_u64)(product >> 64) }; |
971 | 6.20M | return r128; |
972 | | |
973 | | /* |
974 | | * MSVC for x64's _umul128 method. |
975 | | * |
976 | | * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); |
977 | | * |
978 | | * This compiles to single operand MUL on x64. |
979 | | */ |
980 | | #elif defined(_M_X64) || defined(_M_IA64) |
981 | | |
982 | | #ifndef _MSC_VER |
983 | | # pragma intrinsic(_umul128) |
984 | | #endif |
985 | | xxh_u64 product_high; |
986 | | xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); |
987 | | XXPH128_hash_t const r128 = { product_low, product_high }; |
988 | | return r128; |
989 | | |
990 | | #else |
991 | | /* |
992 | | * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. |
993 | | * |
994 | | * This is a fast and simple grade school multiply, which is shown |
995 | | * below with base 10 arithmetic instead of base 0x100000000. |
996 | | * |
997 | | * 9 3 // D2 lhs = 93 |
998 | | * x 7 5 // D2 rhs = 75 |
999 | | * ---------- |
1000 | | * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) |
1001 | | * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) |
1002 | | * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) |
1003 | | * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) |
1004 | | * --------- |
1005 | | * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 |
1006 | | * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 |
1007 | | * --------- |
1008 | | * 6 9 7 5 |
1009 | | * |
1010 | | * The reasons for adding the products like this are: |
1011 | | * 1. It avoids manual carry tracking. Just like how |
1012 | | * (9 * 9) + 9 + 9 = 99, the same applies with this for |
1013 | | * UINT64_MAX. This avoids a lot of complexity. |
1014 | | * |
1015 | | * 2. It hints for, and on Clang, compiles to, the powerful UMAAL |
1016 | | * instruction available in ARMv6+ A32/T32, which is shown below: |
1017 | | * |
1018 | | * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) |
1019 | | * { |
1020 | | * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; |
1021 | | * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); |
1022 | | * *RdHi = (xxh_u32)(product >> 32); |
1023 | | * } |
1024 | | * |
1025 | | * This instruction was designed for efficient long multiplication, |
1026 | | * and allows this to be calculated in only 4 instructions which |
1027 | | * is comparable to some 64-bit ALUs. |
1028 | | * |
1029 | | * 3. It isn't terrible on other platforms. Usually this will be |
1030 | | * a couple of 32-bit ADD/ADCs. |
1031 | | */ |
1032 | | |
1033 | | /* First calculate all of the cross products. */ |
1034 | | xxh_u64 const lo_lo = XXPH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); |
1035 | | xxh_u64 const hi_lo = XXPH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); |
1036 | | xxh_u64 const lo_hi = XXPH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); |
1037 | | xxh_u64 const hi_hi = XXPH_mult32to64(lhs >> 32, rhs >> 32); |
1038 | | |
1039 | | /* Now add the products together. These will never overflow. */ |
1040 | | xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; |
1041 | | xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; |
1042 | | xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); |
1043 | | |
1044 | | XXPH128_hash_t r128 = { lower, upper }; |
1045 | | return r128; |
1046 | | #endif |
1047 | 6.20M | } |
1048 | | |
1049 | | /* |
1050 | | * We want to keep the attribute here because a target switch |
1051 | | * disables inlining. |
1052 | | * |
1053 | | * Does a 64-bit to 128-bit multiply, then XOR folds it. |
1054 | | * The reason for the separate function is to prevent passing |
1055 | | * too many structs around by value. This will hopefully inline |
1056 | | * the multiply, but we don't force it. |
1057 | | */ |
1058 | | #if defined(__GNUC__) && !defined(__clang__) && defined(__i386__) |
1059 | | __attribute__((__target__("no-sse"))) |
1060 | | #endif |
1061 | | static xxh_u64 |
1062 | | XXPH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) |
1063 | 6.20M | { |
1064 | 6.20M | XXPH128_hash_t product = XXPH_mult64to128(lhs, rhs); |
1065 | 6.20M | return product.low64 ^ product.high64; |
1066 | 6.20M | } |
1067 | | |
1068 | | |
1069 | | static XXPH64_hash_t XXPH3_avalanche(xxh_u64 h64) |
1070 | 8.32M | { |
1071 | 8.32M | h64 ^= h64 >> 37; |
1072 | 8.32M | h64 *= PRIME64_3; |
1073 | 8.32M | h64 ^= h64 >> 32; |
1074 | 8.32M | return h64; |
1075 | 8.32M | } |
1076 | | |
1077 | | |
1078 | | /* ========================================== |
1079 | | * Short keys |
1080 | | * ========================================== */ |
1081 | | |
1082 | | XXPH_FORCE_INLINE XXPH64_hash_t |
1083 | | XXPH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXPH64_hash_t seed) |
1084 | 2.81M | { |
1085 | 2.81M | XXPH_ASSERT(input != NULL); |
1086 | 2.81M | XXPH_ASSERT(1 <= len && len <= 3); |
1087 | 2.81M | XXPH_ASSERT(secret != NULL); |
1088 | 2.81M | { xxh_u8 const c1 = input[0]; |
1089 | 2.81M | xxh_u8 const c2 = input[len >> 1]; |
1090 | 2.81M | xxh_u8 const c3 = input[len - 1]; |
1091 | 2.81M | xxh_u32 const combined = ((xxh_u32)c1) | (((xxh_u32)c2) << 8) | (((xxh_u32)c3) << 16) | (((xxh_u32)len) << 24); |
1092 | 2.81M | xxh_u64 const keyed = (xxh_u64)combined ^ (XXPH_readLE32(secret) + seed); |
1093 | 2.81M | xxh_u64 const mixed = keyed * PRIME64_1; |
1094 | 2.81M | return XXPH3_avalanche(mixed); |
1095 | 2.81M | } |
1096 | 2.81M | } |
1097 | | |
1098 | | XXPH_FORCE_INLINE XXPH64_hash_t |
1099 | | XXPH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXPH64_hash_t seed) |
1100 | 4.53M | { |
1101 | 4.53M | XXPH_ASSERT(input != NULL); |
1102 | 4.53M | XXPH_ASSERT(secret != NULL); |
1103 | 4.53M | XXPH_ASSERT(4 <= len && len <= 8); |
1104 | 4.53M | { xxh_u32 const input_lo = XXPH_readLE32(input); |
1105 | 4.53M | xxh_u32 const input_hi = XXPH_readLE32(input + len - 4); |
1106 | 4.53M | xxh_u64 const input_64 = input_lo | ((xxh_u64)input_hi << 32); |
1107 | 4.53M | xxh_u64 const keyed = input_64 ^ (XXPH_readLE64(secret) + seed); |
1108 | 4.53M | xxh_u64 const mix64 = len + ((keyed ^ (keyed >> 51)) * PRIME32_1); |
1109 | 4.53M | return XXPH3_avalanche((mix64 ^ (mix64 >> 47)) * PRIME64_2); |
1110 | 4.53M | } |
1111 | 4.53M | } |
1112 | | |
1113 | | XXPH_FORCE_INLINE XXPH64_hash_t |
1114 | | XXPH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXPH64_hash_t seed) |
1115 | 78.1k | { |
1116 | 78.1k | XXPH_ASSERT(input != NULL); |
1117 | 78.1k | XXPH_ASSERT(secret != NULL); |
1118 | 78.1k | XXPH_ASSERT(9 <= len && len <= 16); |
1119 | 78.1k | { xxh_u64 const input_lo = XXPH_readLE64(input) ^ (XXPH_readLE64(secret) + seed); |
1120 | 78.1k | xxh_u64 const input_hi = XXPH_readLE64(input + len - 8) ^ (XXPH_readLE64(secret + 8) - seed); |
1121 | 78.1k | xxh_u64 const acc = len + (input_lo + input_hi) + XXPH3_mul128_fold64(input_lo, input_hi); |
1122 | 78.1k | return XXPH3_avalanche(acc); |
1123 | 78.1k | } |
1124 | 78.1k | } |
1125 | | |
1126 | | XXPH_FORCE_INLINE XXPH64_hash_t |
1127 | | XXPH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXPH64_hash_t seed) |
1128 | 9.54M | { |
1129 | 9.54M | XXPH_ASSERT(len <= 16); |
1130 | 9.54M | { if (len > 8) return XXPH3_len_9to16_64b(input, len, secret, seed); |
1131 | 9.46M | if (len >= 4) return XXPH3_len_4to8_64b(input, len, secret, seed); |
1132 | 4.93M | if (len) return XXPH3_len_1to3_64b(input, len, secret, seed); |
1133 | | /* |
1134 | | * RocksDB modification from XXPH3 preview: zero result for empty |
1135 | | * string can be problematic for multiplication-based algorithms. |
1136 | | * Return a hash of the seed instead. |
1137 | | */ |
1138 | 2.11M | return XXPH3_mul128_fold64(seed + XXPH_readLE64(secret), PRIME64_2); |
1139 | 4.93M | } |
1140 | 4.93M | } |
1141 | | |
1142 | | |
1143 | | /* === Long Keys === */ |
1144 | | |
1145 | 24.6M | #define STRIPE_LEN 64 |
1146 | 5.73M | #define XXPH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ |
1147 | | #define ACC_NB (STRIPE_LEN / sizeof(xxh_u64)) |
1148 | | |
1149 | | typedef enum { XXPH3_acc_64bits, XXPH3_acc_128bits } XXPH3_accWidth_e; |
1150 | | |
1151 | | XXPH_FORCE_INLINE void |
1152 | | XXPH3_accumulate_512( void* XXPH_RESTRICT acc, |
1153 | | const void* XXPH_RESTRICT input, |
1154 | | const void* XXPH_RESTRICT secret, |
1155 | | XXPH3_accWidth_e accWidth) |
1156 | 5.72M | { |
1157 | 5.72M | #if (XXPH_VECTOR == XXPH_AVX2) |
1158 | | |
1159 | 5.72M | XXPH_ASSERT((((size_t)acc) & 31) == 0); |
1160 | 5.72M | { XXPH_ALIGN(32) __m256i* const xacc = (__m256i *) acc; |
1161 | 5.72M | const __m256i* const xinput = (const __m256i *) input; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */ |
1162 | 5.72M | const __m256i* const xsecret = (const __m256i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */ |
1163 | | |
1164 | 5.72M | size_t i; |
1165 | 17.1M | for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { |
1166 | 11.4M | __m256i const data_vec = _mm256_loadu_si256 (xinput+i); |
1167 | 11.4M | __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); |
1168 | 11.4M | __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); /* uint32 dk[8] = {d0^k0, d1^k1, d2^k2, d3^k3, ...} */ |
1169 | 11.4M | __m256i const product = _mm256_mul_epu32 (data_key, _mm256_shuffle_epi32 (data_key, 0x31)); /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */ |
1170 | 11.4M | if (accWidth == XXPH3_acc_128bits) { |
1171 | 0 | __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); |
1172 | 0 | __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); |
1173 | 0 | xacc[i] = _mm256_add_epi64(product, sum); |
1174 | 11.4M | } else { /* XXPH3_acc_64bits */ |
1175 | 11.4M | __m256i const sum = _mm256_add_epi64(xacc[i], data_vec); |
1176 | 11.4M | xacc[i] = _mm256_add_epi64(product, sum); |
1177 | 11.4M | } |
1178 | 11.4M | } } |
1179 | | |
1180 | | #elif (XXPH_VECTOR == XXPH_SSE2) |
1181 | | |
1182 | | XXPH_ASSERT((((size_t)acc) & 15) == 0); |
1183 | | { XXPH_ALIGN(16) __m128i* const xacc = (__m128i *) acc; |
1184 | | const __m128i* const xinput = (const __m128i *) input; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */ |
1185 | | const __m128i* const xsecret = (const __m128i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */ |
1186 | | |
1187 | | size_t i; |
1188 | | for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { |
1189 | | __m128i const data_vec = _mm_loadu_si128 (xinput+i); |
1190 | | __m128i const key_vec = _mm_loadu_si128 (xsecret+i); |
1191 | |             __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);                                  /* uint32 dk[4] = {d0^k0, d1^k1, d2^k2, d3^k3} */
1192 | |             __m128i const product  = _mm_mul_epu32 (data_key, _mm_shuffle_epi32 (data_key, 0x31));  /* uint64 mul[2] = {dk0*dk1, dk2*dk3} */
1193 | | if (accWidth == XXPH3_acc_128bits) { |
1194 | | __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); |
1195 | | __m128i const sum = _mm_add_epi64(xacc[i], data_swap); |
1196 | | xacc[i] = _mm_add_epi64(product, sum); |
1197 | | } else { /* XXPH3_acc_64bits */ |
1198 | | __m128i const sum = _mm_add_epi64(xacc[i], data_vec); |
1199 | | xacc[i] = _mm_add_epi64(product, sum); |
1200 | | } |
1201 | | } } |
1202 | | |
1203 | | #elif (XXPH_VECTOR == XXPH_NEON) |
1204 | | |
1205 | | XXPH_ASSERT((((size_t)acc) & 15) == 0); |
1206 | | { |
1207 | | XXPH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc; |
1208 | | /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */ |
1209 | | uint8_t const* const xinput = (const uint8_t *) input; |
1210 | | uint8_t const* const xsecret = (const uint8_t *) secret; |
1211 | | |
1212 | | size_t i; |
1213 | | for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) { |
1214 | | #if !defined(__aarch64__) && !defined(__arm64__) && defined(__GNUC__) /* ARM32-specific hack */ |
1215 | | /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs) without this. |
1216 | | * vzip on 32-bit ARM NEON will overwrite the original register, and I think that Clang |
1217 | | * assumes I don't want to destroy it and tries to make a copy. This slows down the code |
1218 | | * a lot. |
1219 | | * aarch64 not only uses an entirely different syntax, but it requires three |
1220 | | * instructions... |
1221 | |          *   ext     v1.16B, v0.16B, v0.16B, #8 // select high bits because aarch64 can't address them directly
1222 | | * zip1 v3.2s, v0.2s, v1.2s // first zip |
1223 | | * zip2 v2.2s, v0.2s, v1.2s // second zip |
1224 | | * ...to do what ARM does in one: |
1225 | | * vzip.32 d0, d1 // Interleave high and low bits and overwrite. */ |
1226 | | |
1227 | |         /* data_vec = xinput[i]; */
1228 | | uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16)); |
1229 | | /* key_vec = xsecret[i]; */ |
1230 | | uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16)); |
1231 | | /* data_key = data_vec ^ key_vec; */ |
1232 | | uint32x4_t data_key; |
1233 | | |
1234 | | if (accWidth == XXPH3_acc_64bits) { |
1235 | | /* Add first to prevent register swaps */ |
1236 | | /* xacc[i] += data_vec; */ |
1237 | | xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec)); |
1238 | | } else { /* XXPH3_acc_128bits */ |
1239 | | /* xacc[i] += swap(data_vec); */ |
1240 | | /* can probably be optimized better */ |
1241 | | uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); |
1242 | | uint64x2_t const swapped= vextq_u64(data64, data64, 1); |
1243 | | xacc[i] = vaddq_u64 (xacc[i], swapped); |
1244 | | } |
1245 | | |
1246 | | data_key = vreinterpretq_u32_u8(veorq_u8(data_vec, key_vec)); |
1247 | | |
1248 | | /* Here's the magic. We use the quirkiness of vzip to shuffle data_key in place. |
1249 | | * shuffle: data_key[0, 1, 2, 3] = data_key[0, 2, 1, 3] */ |
1250 | | __asm__("vzip.32 %e0, %f0" : "+w" (data_key)); |
1251 | | /* xacc[i] += (uint64x2_t) data_key[0, 1] * (uint64x2_t) data_key[2, 3]; */ |
1252 | | xacc[i] = vmlal_u32(xacc[i], vget_low_u32(data_key), vget_high_u32(data_key)); |
1253 | | |
1254 | | #else |
1255 | | /* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster than, the vzip method. */ |
1256 | | |
1257 | |         /* data_vec = xinput[i]; */
1258 | | uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16)); |
1259 | | /* key_vec = xsecret[i]; */ |
1260 | | uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16)); |
1261 | | /* data_key = data_vec ^ key_vec; */ |
1262 | | uint64x2_t const data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); |
1263 | | /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); */ |
1264 | | uint32x2_t const data_key_lo = vmovn_u64 (data_key); |
1265 | | /* data_key_hi = (uint32x2_t) (data_key >> 32); */ |
1266 | | uint32x2_t const data_key_hi = vshrn_n_u64 (data_key, 32); |
1267 | | if (accWidth == XXPH3_acc_64bits) { |
1268 | | /* xacc[i] += data_vec; */ |
1269 | | xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec)); |
1270 | | } else { /* XXPH3_acc_128bits */ |
1271 | | /* xacc[i] += swap(data_vec); */ |
1272 | | uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); |
1273 | | uint64x2_t const swapped= vextq_u64(data64, data64, 1); |
1274 | | xacc[i] = vaddq_u64 (xacc[i], swapped); |
1275 | | } |
1276 | | /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ |
1277 | | xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); |
1278 | | |
1279 | | #endif |
1280 | | } |
1281 | | } |
1282 | | |
1283 | | #elif (XXPH_VECTOR == XXPH_VSX) && /* work around a compiler bug */ (__GNUC__ > 5) |
1284 | | U64x2* const xacc = (U64x2*) acc; /* presumed aligned */ |
1285 | | U64x2 const* const xinput = (U64x2 const*) input; /* no alignment restriction */ |
1286 | | U64x2 const* const xsecret = (U64x2 const*) secret; /* no alignment restriction */ |
1287 | | U64x2 const v32 = { 32, 32 }; |
1288 | | #if XXPH_VSX_BE |
1289 | | U8x16 const vXorSwap = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70, |
1290 | | 0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 }; |
1291 | | #endif |
1292 | | size_t i; |
1293 | | for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) { |
1294 | | /* data_vec = xinput[i]; */ |
1295 | | /* key_vec = xsecret[i]; */ |
1296 | | #if XXPH_VSX_BE |
1297 | | /* byteswap */ |
1298 | | U64x2 const data_vec = XXPH_vec_revb(vec_vsx_ld(0, xinput + i)); |
1299 | | U64x2 const key_raw = vec_vsx_ld(0, xsecret + i); |
1300 | | /* See comment above. data_key = data_vec ^ swap(xsecret[i]); */ |
1301 | | U64x2 const data_key = (U64x2)XXPH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap); |
1302 | | #else |
1303 | | U64x2 const data_vec = vec_vsx_ld(0, xinput + i); |
1304 | | U64x2 const key_vec = vec_vsx_ld(0, xsecret + i); |
1305 | | U64x2 const data_key = data_vec ^ key_vec; |
1306 | | #endif |
1307 | | /* shuffled = (data_key << 32) | (data_key >> 32); */ |
1308 | | U32x4 const shuffled = (U32x4)vec_rl(data_key, v32); |
1309 | | /* product = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)shuffled & 0xFFFFFFFF); */ |
1310 | | U64x2 const product = XXPH_vec_mulo((U32x4)data_key, shuffled); |
1311 | | xacc[i] += product; |
1312 | | |
1313 | | if (accWidth == XXPH3_acc_64bits) { |
1314 | | xacc[i] += data_vec; |
1315 | | } else { /* XXPH3_acc_128bits */ |
1316 | | /* swap high and low halves */ |
1317 | | U64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2); |
1318 | | xacc[i] += data_swapped; |
1319 | | } |
1320 | | } |
1321 | | |
1322 | | #else /* scalar variant of Accumulator - universal */ |
1323 | | |
1324 | | XXPH_ALIGN(XXPH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */ |
1325 | | const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */ |
1326 | | const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ |
1327 | | size_t i; |
1328 | | XXPH_ASSERT(((size_t)acc & (XXPH_ACC_ALIGN-1)) == 0); |
1329 | | for (i=0; i < ACC_NB; i++) { |
1330 | | xxh_u64 const data_val = XXPH_readLE64(xinput + 8*i); |
1331 | | xxh_u64 const data_key = data_val ^ XXPH_readLE64(xsecret + i*8); |
1332 | | |
1333 | | if (accWidth == XXPH3_acc_64bits) { |
1334 | | xacc[i] += data_val; |
1335 | | } else { |
1336 | | xacc[i ^ 1] += data_val; /* swap adjacent lanes */ |
1337 | | } |
1338 | | xacc[i] += XXPH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); |
1339 | | } |
1340 | | #endif |
1341 | 5.72M | } |
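 | |
 | | /*
 | |  * For reference (illustrative, mirroring the scalar variant above): with
 | |  * dk = data ^ secret, each 64-bit lane of the accumulate step computes
 | |  *
 | |  *   acc[i] += lo32(dk) * hi32(dk);
 | |  *   acc[i] += data;     // 64-bit width; 128-bit width adds the swapped lane instead
 | |  *
 | |  * where lo32()/hi32() denote the low/high 32-bit halves.
 | |  */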
1342 | | |
1343 | | XXPH_FORCE_INLINE void |
1344 | | XXPH3_scrambleAcc(void* XXPH_RESTRICT acc, const void* XXPH_RESTRICT secret) |
1345 | 313k | { |
1346 | 313k | #if (XXPH_VECTOR == XXPH_AVX2) |
1347 | | |
1348 | 313k | XXPH_ASSERT((((size_t)acc) & 31) == 0); |
1349 | 313k | { XXPH_ALIGN(32) __m256i* const xacc = (__m256i*) acc; |
1350 | 313k | const __m256i* const xsecret = (const __m256i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this argument type */ |
1351 | 313k | const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1); |
1352 | | |
1353 | 313k | size_t i; |
1354 | 941k | for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { |
1355 | | /* xacc[i] ^= (xacc[i] >> 47) */ |
1356 | 627k | __m256i const acc_vec = xacc[i]; |
1357 | 627k | __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); |
1358 | 627k | __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); |
1359 | | /* xacc[i] ^= xsecret; */ |
1360 | 627k | __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); |
1361 | 627k | __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); |
1362 | | |
1363 | | /* xacc[i] *= PRIME32_1; */ |
1364 | 627k | __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, 0x31); |
1365 | 627k | __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); |
1366 | 627k | __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); |
1367 | 627k | xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); |
1368 | 627k | } |
1369 | 313k | } |
1370 | | |
1371 | | #elif (XXPH_VECTOR == XXPH_SSE2) |
1372 | | |
1373 | | XXPH_ASSERT((((size_t)acc) & 15) == 0); |
1374 | | { XXPH_ALIGN(16) __m128i* const xacc = (__m128i*) acc; |
1375 | | const __m128i* const xsecret = (const __m128i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this argument type */ |
1376 | | const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1); |
1377 | | |
1378 | | size_t i; |
1379 | | for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { |
1380 | | /* xacc[i] ^= (xacc[i] >> 47) */ |
1381 | | __m128i const acc_vec = xacc[i]; |
1382 | | __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); |
1383 | | __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); |
1384 | | /* xacc[i] ^= xsecret; */ |
1385 | | __m128i const key_vec = _mm_loadu_si128 (xsecret+i); |
1386 | | __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); |
1387 | | |
1388 | | /* xacc[i] *= PRIME32_1; */ |
1389 | | __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, 0x31); |
1390 | | __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); |
1391 | | __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); |
1392 | | xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); |
1393 | | } |
1394 | | } |
1395 | | |
1396 | | #elif (XXPH_VECTOR == XXPH_NEON) |
1397 | | |
1398 | | XXPH_ASSERT((((size_t)acc) & 15) == 0); |
1399 | | |
1400 | | { uint64x2_t* const xacc = (uint64x2_t*) acc; |
1401 | | uint8_t const* const xsecret = (uint8_t const*) secret; |
1402 | | uint32x2_t const prime = vdup_n_u32 (PRIME32_1); |
1403 | | |
1404 | | size_t i; |
1405 | | for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) { |
1406 | | /* data_vec = xacc[i] ^ (xacc[i] >> 47); */ |
1407 | | uint64x2_t const acc_vec = xacc[i]; |
1408 | | uint64x2_t const shifted = vshrq_n_u64 (acc_vec, 47); |
1409 | | uint64x2_t const data_vec = veorq_u64 (acc_vec, shifted); |
1410 | | |
1411 | | /* key_vec = xsecret[i]; */ |
1412 | | uint32x4_t const key_vec = vreinterpretq_u32_u8(vld1q_u8(xsecret + (i * 16))); |
1413 | | /* data_key = data_vec ^ key_vec; */ |
1414 | | uint32x4_t const data_key = veorq_u32 (vreinterpretq_u32_u64(data_vec), key_vec); |
1415 | | /* shuffled = { data_key[0, 2], data_key[1, 3] }; */ |
1416 | | uint32x2x2_t const shuffled = vzip_u32 (vget_low_u32(data_key), vget_high_u32(data_key)); |
1417 | | |
1418 | | /* data_key *= PRIME32_1 */ |
1419 | | |
1420 | | /* prod_hi = (data_key >> 32) * PRIME32_1; */ |
1421 | | uint64x2_t const prod_hi = vmull_u32 (shuffled.val[1], prime); |
1422 | | /* xacc[i] = prod_hi << 32; */ |
1423 | | xacc[i] = vshlq_n_u64(prod_hi, 32); |
1424 | |             /* xacc[i] += (data_key & 0xFFFFFFFF) * PRIME32_1; */
1425 | | xacc[i] = vmlal_u32(xacc[i], shuffled.val[0], prime); |
1426 | | } } |
1427 | | |
1428 | | #elif (XXPH_VECTOR == XXPH_VSX) && /* work around a compiler bug */ (__GNUC__ > 5) |
1429 | | |
1430 | | U64x2* const xacc = (U64x2*) acc; |
1431 | | const U64x2* const xsecret = (const U64x2*) secret; |
1432 | | /* constants */ |
1433 | | U64x2 const v32 = { 32, 32 }; |
1434 | | U64x2 const v47 = { 47, 47 }; |
1435 | | U32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 }; |
1436 | | size_t i; |
1437 | | #if XXPH_VSX_BE |
1438 | | /* endian swap */ |
1439 | | U8x16 const vXorSwap = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70, |
1440 | | 0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 }; |
1441 | | #endif |
1442 | | for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) { |
1443 | | U64x2 const acc_vec = xacc[i]; |
1444 | | U64x2 const data_vec = acc_vec ^ (acc_vec >> v47); |
1445 | | /* key_vec = xsecret[i]; */ |
1446 | | #if XXPH_VSX_BE |
1447 | |         /* byte-swap the secret words as part of the xor */
1448 | | U64x2 const key_raw = vec_vsx_ld(0, xsecret + i); |
1449 | | U64x2 const data_key = (U64x2)XXPH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap); |
1450 | | #else |
1451 | | U64x2 const key_vec = vec_vsx_ld(0, xsecret + i); |
1452 | | U64x2 const data_key = data_vec ^ key_vec; |
1453 | | #endif |
1454 | | |
1455 | | /* data_key *= PRIME32_1 */ |
1456 | | |
1457 | | /* prod_lo = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)prime & 0xFFFFFFFF); */ |
1458 | | U64x2 const prod_even = XXPH_vec_mule((U32x4)data_key, prime); |
1459 | | /* prod_hi = ((U64x2)data_key >> 32) * ((U64x2)prime >> 32); */ |
1460 | | U64x2 const prod_odd = XXPH_vec_mulo((U32x4)data_key, prime); |
1461 | | xacc[i] = prod_odd + (prod_even << v32); |
1462 | | } |
1463 | | |
1464 | | #else /* scalar variant of Scrambler - universal */ |
1465 | | |
1466 | | XXPH_ALIGN(XXPH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */ |
1467 | | const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ |
1468 | | size_t i; |
1469 | | XXPH_ASSERT((((size_t)acc) & (XXPH_ACC_ALIGN-1)) == 0); |
1470 | | for (i=0; i < ACC_NB; i++) { |
1471 | | xxh_u64 const key64 = XXPH_readLE64(xsecret + 8*i); |
1472 | | xxh_u64 acc64 = xacc[i]; |
1473 | | acc64 ^= acc64 >> 47; |
1474 | | acc64 ^= key64; |
1475 | | acc64 *= PRIME32_1; |
1476 | | xacc[i] = acc64; |
1477 | | } |
1478 | | |
1479 | | #endif |
1480 | 313k | } |
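 | |
 | | /*
 | |  * For reference (illustrative, mirroring the scalar variant above): the
 | |  * scramble applied to each 64-bit lane is
 | |  *
 | |  *   acc ^= acc >> 47;  acc ^= secret;  acc *= PRIME32_1;
 | |  */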
1481 | | |
1482 | | #define XXPH_PREFETCH_DIST 384 |
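 | | /* 384 bytes == 6 stripes of 64 bytes, so the prefetch runs a few iterations ahead of the accumulate loop. */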
1483 | | |
1484 | | /* assumption : nbStripes will not overflow secret size */ |
1485 | | XXPH_FORCE_INLINE void |
1486 | | XXPH3_accumulate( xxh_u64* XXPH_RESTRICT acc, |
1487 | | const xxh_u8* XXPH_RESTRICT input, |
1488 | | const xxh_u8* XXPH_RESTRICT secret, |
1489 | | size_t nbStripes, |
1490 | | XXPH3_accWidth_e accWidth) |
1491 | 409k | { |
1492 | 409k | size_t n; |
1493 | 6.04M | for (n = 0; n < nbStripes; n++ ) { |
1494 | 5.63M | const xxh_u8* const in = input + n*STRIPE_LEN; |
1495 | 5.63M | XXPH_PREFETCH(in + XXPH_PREFETCH_DIST); |
1496 | 5.63M | XXPH3_accumulate_512(acc, |
1497 | 5.63M | in, |
1498 | 5.63M | secret + n*XXPH_SECRET_CONSUME_RATE, |
1499 | 5.63M | accWidth); |
1500 | 5.63M | } |
1501 | 409k | } |
1502 | | |
1503 | | /* note : clang auto-vectorizes well in SSE2 mode _if_ this function is `static`,
1504 | |  *        and doesn't auto-vectorize it at all if it is `FORCE_INLINE`.
1505 | |  *        However, it auto-vectorizes AVX2 better if it is `FORCE_INLINE`.
1506 | |  *        Pretty much every other mode and compiler prefers `FORCE_INLINE`.
1507 | | */ |
1508 | | |
1509 | | #if defined(__clang__) && (XXPH_VECTOR==0) && !defined(__AVX2__) && !defined(__arm__) && !defined(__thumb__) |
1510 | | static void |
1511 | | #else |
1512 | | XXPH_FORCE_INLINE void |
1513 | | #endif |
1514 | | XXPH3_hashLong_internal_loop( xxh_u64* XXPH_RESTRICT acc, |
1515 | | const xxh_u8* XXPH_RESTRICT input, size_t len, |
1516 | | const xxh_u8* XXPH_RESTRICT secret, size_t secretSize, |
1517 | | XXPH3_accWidth_e accWidth) |
1518 | 96.0k | { |
1519 | 96.0k | size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXPH_SECRET_CONSUME_RATE; |
1520 | 96.0k | size_t const block_len = STRIPE_LEN * nb_rounds; |
1521 | 96.0k | size_t const nb_blocks = len / block_len; |
1522 | | |
1523 | 96.0k | size_t n; |
1524 | | |
1525 | 96.0k | XXPH_ASSERT(secretSize >= XXPH3_SECRET_SIZE_MIN); |
1526 | | |
1527 | 409k | for (n = 0; n < nb_blocks; n++) { |
1528 | 313k | XXPH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth); |
1529 | 313k | XXPH3_scrambleAcc(acc, secret + secretSize - STRIPE_LEN); |
1530 | 313k | } |
1531 | | |
1532 | | /* last partial block */ |
1533 | 96.0k | XXPH_ASSERT(len > STRIPE_LEN); |
1534 | 96.0k | { size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN; |
1535 | 96.0k | XXPH_ASSERT(nbStripes <= (secretSize / XXPH_SECRET_CONSUME_RATE)); |
1536 | 96.0k | XXPH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth); |
1537 | | |
1538 | | /* last stripe */ |
1539 | 96.0k | if (len & (STRIPE_LEN - 1)) { |
1540 | 94.4k | const xxh_u8* const p = input + len - STRIPE_LEN; |
1541 | 94.4k | #define XXPH_SECRET_LASTACC_START 7 /* do not align on 8, so that secret is different from scrambler */ |
1542 | 94.4k | XXPH3_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXPH_SECRET_LASTACC_START, accWidth); |
1543 | 94.4k | } } |
1544 | 96.0k | } |
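 | |
 | | /*
 | |  * For reference (illustrative summary): the loop above consumes the input in
 | |  * blocks of nb_rounds stripes, scrambling the accumulators after each full
 | |  * block; it then accumulates the remaining whole stripes and, if the length
 | |  * is not a multiple of STRIPE_LEN, one final (overlapping) stripe taken from
 | |  * the very end of the input against a dedicated slice of the secret.
 | |  */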
1545 | | |
1546 | | XXPH_FORCE_INLINE xxh_u64 |
1547 | | XXPH3_mix2Accs(const xxh_u64* XXPH_RESTRICT acc, const xxh_u8* XXPH_RESTRICT secret) |
1548 | 384k | { |
1549 | 384k | return XXPH3_mul128_fold64( |
1550 | 384k | acc[0] ^ XXPH_readLE64(secret), |
1551 | 384k | acc[1] ^ XXPH_readLE64(secret+8) ); |
1552 | 384k | } |
1553 | | |
1554 | | static XXPH64_hash_t |
1555 | | XXPH3_mergeAccs(const xxh_u64* XXPH_RESTRICT acc, const xxh_u8* XXPH_RESTRICT secret, xxh_u64 start) |
1556 | 96.0k | { |
1557 | 96.0k | xxh_u64 result64 = start; |
1558 | | |
1559 | 96.0k | result64 += XXPH3_mix2Accs(acc+0, secret + 0); |
1560 | 96.0k | result64 += XXPH3_mix2Accs(acc+2, secret + 16); |
1561 | 96.0k | result64 += XXPH3_mix2Accs(acc+4, secret + 32); |
1562 | 96.0k | result64 += XXPH3_mix2Accs(acc+6, secret + 48); |
1563 | | |
1564 | 96.0k | return XXPH3_avalanche(result64); |
1565 | 96.0k | } |
1566 | | |
1567 | 96.0k | #define XXPH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \ |
1568 | 96.0k | PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 }; |
1569 | | |
1570 | | XXPH_FORCE_INLINE XXPH64_hash_t |
1571 | | XXPH3_hashLong_internal(const xxh_u8* XXPH_RESTRICT input, size_t len, |
1572 | | const xxh_u8* XXPH_RESTRICT secret, size_t secretSize) |
1573 | 96.0k | { |
1574 | 96.0k | XXPH_ALIGN(XXPH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXPH3_INIT_ACC; |
1575 | | |
1576 | 96.0k | XXPH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXPH3_acc_64bits); |
1577 | | |
1578 | | /* converge into final hash */ |
1579 | 96.0k | XXPH_STATIC_ASSERT(sizeof(acc) == 64); |
1580 | 96.0k | #define XXPH_SECRET_MERGEACCS_START 11 /* do not align on 8, so that secret is different from accumulator */ |
1581 | 96.0k | XXPH_ASSERT(secretSize >= sizeof(acc) + XXPH_SECRET_MERGEACCS_START); |
1582 | 96.0k | return XXPH3_mergeAccs(acc, secret + XXPH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1); |
1583 | 96.0k | } |
1584 | | |
1585 | | |
1586 | | XXPH_NO_INLINE XXPH64_hash_t    /* It's important for performance that XXPH3_hashLong is not inlined; not sure why (uop cache, maybe?), but the difference is large and easily measurable */
1587 | | XXPH3_hashLong_64b_defaultSecret(const xxh_u8* XXPH_RESTRICT input, size_t len) |
1588 | 52.9k | { |
1589 | 52.9k | return XXPH3_hashLong_internal(input, len, kSecret, sizeof(kSecret)); |
1590 | 52.9k | } |
1591 | | |
1592 | | XXPH_NO_INLINE XXPH64_hash_t    /* It's important for performance that XXPH3_hashLong is not inlined; not sure why (uop cache, maybe?), but the difference is large and easily measurable */
1593 | | XXPH3_hashLong_64b_withSecret(const xxh_u8* XXPH_RESTRICT input, size_t len, |
1594 | | const xxh_u8* XXPH_RESTRICT secret, size_t secretSize) |
1595 | 0 | { |
1596 | 0 | return XXPH3_hashLong_internal(input, len, secret, secretSize); |
1597 | 0 | } |
1598 | | |
1599 | | |
1600 | | XXPH_FORCE_INLINE void XXPH_writeLE64(void* dst, xxh_u64 v64) |
1601 | 1.03M | { |
1602 | 1.03M | if (!XXPH_CPU_LITTLE_ENDIAN) v64 = XXPH_swap64(v64); |
1603 | 1.03M | memcpy(dst, &v64, sizeof(v64)); |
1604 | 1.03M | } |
1605 | | |
1606 | | /* XXPH3_initCustomSecret() : |
1607 | |  * destination `customSecret` is presumed allocated and of the same size as `kSecret`.
1608 | | */ |
1609 | | XXPH_FORCE_INLINE void XXPH3_initCustomSecret(xxh_u8* customSecret, xxh_u64 seed64) |
1610 | 43.1k | { |
1611 | 43.1k | int const nbRounds = XXPH_SECRET_DEFAULT_SIZE / 16; |
1612 | 43.1k | int i; |
1613 | | |
1614 | 43.1k | XXPH_STATIC_ASSERT((XXPH_SECRET_DEFAULT_SIZE & 15) == 0); |
1615 | | |
1616 | 560k | for (i=0; i < nbRounds; i++) { |
1617 | 517k | XXPH_writeLE64(customSecret + 16*i, XXPH_readLE64(kSecret + 16*i) + seed64); |
1618 | 517k | XXPH_writeLE64(customSecret + 16*i + 8, XXPH_readLE64(kSecret + 16*i + 8) - seed64); |
1619 | 517k | } |
1620 | 43.1k | } |
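 | |
 | | /*
 | |  * For reference (illustrative): per 16-byte group, the custom secret derived
 | |  * above is, in 64-bit little-endian words,
 | |  *
 | |  *   custom[2k]   = kSecret[2k]   + seed;
 | |  *   custom[2k+1] = kSecret[2k+1] - seed;
 | |  */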
1621 | | |
1622 | | |
1623 | | /* XXPH3_hashLong_64b_withSeed() : |
1624 | |  * Generate a custom key by altering the default kSecret with the seed,
1625 | |  * and then use this key for long-mode hashing.
1626 | |  * This operation is decently fast,
1627 | |  * but it nonetheless costs a little bit of time;
1628 | |  * try to avoid it whenever possible (typically when seed==0).
1629 | | */ |
1630 | | XXPH_NO_INLINE XXPH64_hash_t    /* It's important for performance that XXPH3_hashLong is not inlined; not sure why (uop cache, maybe?), but the difference is large and easily measurable */
1631 | | XXPH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXPH64_hash_t seed) |
1632 | 96.0k | { |
1633 | 96.0k | XXPH_ALIGN(8) xxh_u8 secret[XXPH_SECRET_DEFAULT_SIZE]; |
1634 | 96.0k | if (seed==0) return XXPH3_hashLong_64b_defaultSecret(input, len); |
1635 | 43.1k | XXPH3_initCustomSecret(secret, seed); |
1636 | 43.1k | return XXPH3_hashLong_internal(input, len, secret, sizeof(secret)); |
1637 | 96.0k | } |
1638 | | |
1639 | | |
1640 | | XXPH_FORCE_INLINE xxh_u64 XXPH3_mix16B(const xxh_u8* XXPH_RESTRICT input, |
1641 | | const xxh_u8* XXPH_RESTRICT secret, xxh_u64 seed64) |
1642 | 3.62M | { |
1643 | 3.62M | xxh_u64 const input_lo = XXPH_readLE64(input); |
1644 | 3.62M | xxh_u64 const input_hi = XXPH_readLE64(input+8); |
1645 | 3.62M | return XXPH3_mul128_fold64( |
1646 | 3.62M | input_lo ^ (XXPH_readLE64(secret) + seed64), |
1647 | 3.62M | input_hi ^ (XXPH_readLE64(secret+8) - seed64) ); |
1648 | 3.62M | } |
1649 | | |
1650 | | |
1651 | | XXPH_FORCE_INLINE XXPH64_hash_t |
1652 | | XXPH3_len_17to128_64b(const xxh_u8* XXPH_RESTRICT input, size_t len, |
1653 | | const xxh_u8* XXPH_RESTRICT secret, size_t secretSize, |
1654 | | XXPH64_hash_t seed) |
1655 | 486k | { |
1656 | 486k | XXPH_ASSERT(secretSize >= XXPH3_SECRET_SIZE_MIN); (void)secretSize; |
1657 | 486k | XXPH_ASSERT(16 < len && len <= 128); |
1658 | | |
1659 | 486k | { xxh_u64 acc = len * PRIME64_1; |
1660 | 486k | if (len > 32) { |
1661 | 233k | if (len > 64) { |
1662 | 112k | if (len > 96) { |
1663 | 50.5k | acc += XXPH3_mix16B(input+48, secret+96, seed); |
1664 | 50.5k | acc += XXPH3_mix16B(input+len-64, secret+112, seed); |
1665 | 50.5k | } |
1666 | 112k | acc += XXPH3_mix16B(input+32, secret+64, seed); |
1667 | 112k | acc += XXPH3_mix16B(input+len-48, secret+80, seed); |
1668 | 112k | } |
1669 | 233k | acc += XXPH3_mix16B(input+16, secret+32, seed); |
1670 | 233k | acc += XXPH3_mix16B(input+len-32, secret+48, seed); |
1671 | 233k | } |
1672 | 486k | acc += XXPH3_mix16B(input+0, secret+0, seed); |
1673 | 486k | acc += XXPH3_mix16B(input+len-16, secret+16, seed); |
1674 | | |
1675 | 486k | return XXPH3_avalanche(acc); |
1676 | 486k | } |
1677 | 486k | } |
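 | |
 | | /*
 | |  * For reference (illustrative): for 17-128 bytes the input is covered by up
 | |  * to four pairs of 16-byte mixes taken symmetrically from the front and the
 | |  * back of the input, so overlapping reads handle any length in this range
 | |  * without a variable-length loop.
 | |  */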
1678 | | |
1679 | 248k | #define XXPH3_MIDSIZE_MAX 240 |
1680 | | |
1681 | | XXPH_NO_INLINE XXPH64_hash_t |
1682 | | XXPH3_len_129to240_64b(const xxh_u8* XXPH_RESTRICT input, size_t len, |
1683 | | const xxh_u8* XXPH_RESTRICT secret, size_t secretSize, |
1684 | | XXPH64_hash_t seed) |
1685 | 152k | { |
1686 | 152k | XXPH_ASSERT(secretSize >= XXPH3_SECRET_SIZE_MIN); (void)secretSize; |
1687 | 152k | XXPH_ASSERT(128 < len && len <= XXPH3_MIDSIZE_MAX); |
1688 | | |
1689 | 485k | #define XXPH3_MIDSIZE_STARTOFFSET 3 |
1690 | 152k | #define XXPH3_MIDSIZE_LASTOFFSET 17 |
1691 | | |
1692 | 152k | { xxh_u64 acc = len * PRIME64_1; |
1693 | 152k | int const nbRounds = (int)len / 16; |
1694 | 152k | int i; |
1695 | 1.37M | for (i=0; i<8; i++) { |
1696 | 1.21M | acc += XXPH3_mix16B(input+(16*i), secret+(16*i), seed); |
1697 | 1.21M | } |
1698 | 152k | acc = XXPH3_avalanche(acc); |
1699 | 152k | XXPH_ASSERT(nbRounds >= 8); |
1700 | 638k | for (i=8 ; i < nbRounds; i++) { |
1701 | 485k | acc += XXPH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXPH3_MIDSIZE_STARTOFFSET, seed); |
1702 | 485k | } |
1703 | | /* last bytes */ |
1704 | 152k | acc += XXPH3_mix16B(input + len - 16, secret + XXPH3_SECRET_SIZE_MIN - XXPH3_MIDSIZE_LASTOFFSET, seed); |
1705 | 152k | return XXPH3_avalanche(acc); |
1706 | 152k | } |
1707 | 152k | } |
1708 | | |
1709 | | /* === Public entry point === */ |
1710 | | |
1711 | | XXPH_PUBLIC_API XXPH64_hash_t XXPH3_64bits(const void* input, size_t len) |
1712 | 0 | { |
1713 | 0 | if (len <= 16) return XXPH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0); |
1714 | 0 | if (len <= 128) return XXPH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); |
1715 | 0 | if (len <= XXPH3_MIDSIZE_MAX) return XXPH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); |
1716 | 0 | return XXPH3_hashLong_64b_defaultSecret((const xxh_u8*)input, len); |
1717 | 0 | } |
1718 | | |
1719 | | XXPH_PUBLIC_API XXPH64_hash_t |
1720 | | XXPH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) |
1721 | 0 | { |
1722 | 0 | XXPH_ASSERT(secretSize >= XXPH3_SECRET_SIZE_MIN); |
1723 | 0 |     /* if any action must be taken when the `secret` conditions are not respected,
1724 | 0 |      * it should be done here.
1725 | 0 |      * For now, it's a contract pre-condition.
1726 | 0 |      * Adding a check and a branch here would cost performance at every hash. */
1727 | 0 | if (len <= 16) return XXPH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0); |
1728 | 0 | if (len <= 128) return XXPH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); |
1729 | 0 | if (len <= XXPH3_MIDSIZE_MAX) return XXPH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); |
1730 | 0 | return XXPH3_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize); |
1731 | 0 | } |
1732 | | |
1733 | | XXPH_PUBLIC_API XXPH64_hash_t |
1734 | | XXPH3_64bits_withSeed(const void* input, size_t len, XXPH64_hash_t seed) |
1735 | 10.2M | { |
1736 | 10.2M | if (len <= 16) return XXPH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed); |
1737 | 734k | if (len <= 128) return XXPH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); |
1738 | 248k | if (len <= XXPH3_MIDSIZE_MAX) return XXPH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); |
1739 | 96.0k | return XXPH3_hashLong_64b_withSeed((const xxh_u8*)input, len, seed); |
1740 | 248k | } |
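 | |
 | | /*
 | |  * Minimal usage sketch (illustrative only); hashing an in-memory buffer with
 | |  * the one-shot entry points:
 | |  *
 | |  *   const char* data = "example key";
 | |  *   size_t n = strlen(data);
 | |  *   XXPH64_hash_t h0 = XXPH3_64bits(data, n);                  // seed == 0
 | |  *   XXPH64_hash_t h1 = XXPH3_64bits_withSeed(data, n, 0x1234);
 | |  */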
1741 | | |
1742 | | /* === XXPH3 streaming === */ |
1743 | | |
1744 | | /* RocksDB Note: unused & removed due to a bug in the preview version */
1745 | | |
1746 | | /*======== END #include "xxh3.h", now inlined above ==========*/ |
1747 | | |
1748 | | #endif /* XXPH_NO_LONG_LONG */ |
1749 | | |
1750 | | /* === END RocksDB modification of permanently inlining === */ |
1751 | | |
1752 | | #endif /* defined(XXPH_INLINE_ALL) || defined(XXPH_PRIVATE_API) */ |
1753 | | |
1754 | | #endif /* XXPH_STATIC_LINKING_ONLY */ |
1755 | | |
1756 | | #if defined (__cplusplus) |
1757 | | } |
1758 | | #endif |
1759 | | |
1760 | | #endif /* XXPHASH_H_5627135585666179 */ |