/src/CMake/Utilities/cmliblzma/liblzma/common/memcmplen.h
Line | Count | Source |
1 | | // SPDX-License-Identifier: 0BSD |
2 | | |
3 | | /////////////////////////////////////////////////////////////////////////////// |
4 | | // |
5 | | /// \file memcmplen.h |
6 | | /// \brief Optimized comparison of two buffers |
7 | | // |
8 | | // Author: Lasse Collin |
9 | | // |
10 | | /////////////////////////////////////////////////////////////////////////////// |
11 | | |
12 | | #ifndef LZMA_MEMCMPLEN_H |
13 | | #define LZMA_MEMCMPLEN_H |
14 | | |
15 | | #include "common.h" |
16 | | |
17 | | #ifdef HAVE_IMMINTRIN_H |
18 | | # include <immintrin.h> |
19 | | #endif |
20 | | |
21 | | // Only include <intrin.h> if it is needed. The header is only needed |
22 | | // on Windows when using an MSVC compatible compiler. The Intel compiler |
23 | | // can use the intrinsics without the header file. |
24 | | #if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \ |
25 | | && defined(_MSC_VER) \ |
26 | | && (defined(_M_X64) \ |
27 | | || defined(_M_ARM64) || defined(_M_ARM64EC)) \ |
28 | | && !defined(__INTEL_COMPILER) |
29 | | # include <intrin.h> |
30 | | #endif |
31 | | |
32 | | |
33 | | /// Find out how many equal bytes the two buffers have. |
34 | | /// |
35 | | /// \param buf1 First buffer |
36 | | /// \param buf2 Second buffer |
37 | | /// \param len How many bytes have already been compared and will |
38 | | /// be assumed to match |
39 | | /// \param limit How many bytes to compare at most, including the |
40 | | /// already-compared bytes. This must be significantly |
41 | | /// smaller than UINT32_MAX to avoid integer overflows. |
42 | | /// Up to LZMA_MEMCMPLEN_EXTRA bytes may be read past |
43 | | /// the specified limit from both buf1 and buf2. |
44 | | /// |
45 | | /// \return Number of equal bytes in the buffers is returned. |
46 | | /// This is always at least len and at most limit. |
47 | | /// |
48 | | /// \note LZMA_MEMCMPLEN_EXTRA defines how many extra bytes may be read. |
49 | | /// It's rounded up to 2^n. This extra amount needs to be |
50 | | /// allocated in the buffers being used. It needs to be |
51 | | /// initialized too to keep Valgrind quiet. |
52 | | static lzma_always_inline uint32_t |
53 | | lzma_memcmplen(const uint8_t *buf1, const uint8_t *buf2, |
54 | | uint32_t len, uint32_t limit) |
55 | 0 | { |
56 | 0 | assert(len <= limit); |
57 | 0 | assert(limit <= UINT32_MAX / 2); |
58 | |
|
59 | 0 | #if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \ |
60 | 0 | && (((TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) \ |
61 | 0 | && (defined(__x86_64__) \ |
62 | 0 | || defined(__aarch64__))) \ |
63 | 0 | || (defined(__INTEL_COMPILER) && defined(__x86_64__)) \ |
64 | 0 | || (defined(__INTEL_COMPILER) && defined(_M_X64)) \ |
65 | 0 | || (defined(_MSC_VER) && (defined(_M_X64) \ |
66 | 0 | || defined(_M_ARM64) || defined(_M_ARM64EC)))) |
67 | | // This is only for x86-64 and ARM64 for now. This might be fine on |
68 | | // other 64-bit processors too. On big endian one should use xor |
69 | | // instead of subtraction and switch to __builtin_clzll(). |
70 | | // |
71 | | // Reasons to use subtraction instead of xor: |
72 | | // |
73 | | // - On some x86-64 processors (Intel Sandy Bridge to Tiger Lake), |
74 | | // sub+jz and sub+jnz can be fused but xor+jz or xor+jnz cannot. |
75 | | // Thus using subtraction has potential to be a tiny amount faster |
76 | | // since the code checks if the quotient is non-zero. |
77 | | // |
78 | | // - Some processors (Intel Pentium 4) used to have more ALU |
79 | | // resources for add/sub instructions than and/or/xor. |
80 | | // |
81 | | // The processor info is based on Agner Fog's microarchitecture.pdf |
82 | | // version 2023-05-26. https://www.agner.org/optimize/ |
83 | 0 | #define LZMA_MEMCMPLEN_EXTRA 8 |
84 | 0 | while (len < limit) { |
85 | 0 | const uint64_t x = read64ne(buf1 + len) - read64ne(buf2 + len); |
86 | 0 | if (x != 0) { |
87 | | // MSVC or Intel C compiler on Windows |
88 | | # if defined(_MSC_VER) || defined(__INTEL_COMPILER) |
89 | | unsigned long tmp; |
90 | | _BitScanForward64(&tmp, x); |
91 | | len += (uint32_t)tmp >> 3; |
92 | | // GCC, Clang, or Intel C compiler |
93 | | # else |
94 | 0 | len += (uint32_t)__builtin_ctzll(x) >> 3; |
95 | 0 | # endif |
96 | 0 | return my_min(len, limit); |
97 | 0 | } |
98 | | |
99 | 0 | len += 8; |
100 | 0 | } |
101 | | |
102 | 0 | return limit; |
103 | |
|
104 | | #elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) \ |
105 | | && defined(HAVE__MM_MOVEMASK_EPI8) \ |
106 | | && (defined(__SSE2__) \ |
107 | | || (defined(_MSC_VER) && defined(_M_IX86_FP) \ |
108 | | && _M_IX86_FP >= 2)) |
109 | | // NOTE: This will use 128-bit unaligned access which |
110 | | // TUKLIB_FAST_UNALIGNED_ACCESS wasn't meant to permit, |
111 | | // but it's convenient here since this is x86-only. |
112 | | // |
113 | | // SSE2 version for 32-bit and 64-bit x86. On x86-64 the above |
114 | | // version is sometimes significantly faster and sometimes |
115 | | // slightly slower than this SSE2 version, so this SSE2 |
116 | | // version isn't used on x86-64. |
117 | | # define LZMA_MEMCMPLEN_EXTRA 16 |
118 | | while (len < limit) { |
119 | | const uint32_t x = 0xFFFF ^ (uint32_t)_mm_movemask_epi8( |
120 | | _mm_cmpeq_epi8( |
121 | | _mm_loadu_si128((const __m128i *)(buf1 + len)), |
122 | | _mm_loadu_si128((const __m128i *)(buf2 + len)))); |
123 | | |
124 | | if (x != 0) { |
125 | | len += ctz32(x); |
126 | | return my_min(len, limit); |
127 | | } |
128 | | |
129 | | len += 16; |
130 | | } |
131 | | |
132 | | return limit; |
133 | | |
134 | | #elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) && !defined(WORDS_BIGENDIAN) |
135 | | // Generic 32-bit little endian method |
136 | | # define LZMA_MEMCMPLEN_EXTRA 4 |
137 | | while (len < limit) { |
138 | | uint32_t x = read32ne(buf1 + len) - read32ne(buf2 + len); |
139 | | if (x != 0) { |
140 | | if ((x & 0xFFFF) == 0) { |
141 | | len += 2; |
142 | | x >>= 16; |
143 | | } |
144 | | |
145 | | if ((x & 0xFF) == 0) |
146 | | ++len; |
147 | | |
148 | | return my_min(len, limit); |
149 | | } |
150 | | |
151 | | len += 4; |
152 | | } |
153 | | |
154 | | return limit; |
155 | | |
156 | | #elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) && defined(WORDS_BIGENDIAN) |
157 | | // Generic 32-bit big endian method |
158 | | # define LZMA_MEMCMPLEN_EXTRA 4 |
159 | | while (len < limit) { |
160 | | uint32_t x = read32ne(buf1 + len) ^ read32ne(buf2 + len); |
161 | | if (x != 0) { |
162 | | if ((x & 0xFFFF0000) == 0) { |
163 | | len += 2; |
164 | | x <<= 16; |
165 | | } |
166 | | |
167 | | if ((x & 0xFF000000) == 0) |
168 | | ++len; |
169 | | |
170 | | return my_min(len, limit); |
171 | | } |
172 | | |
173 | | len += 4; |
174 | | } |
175 | | |
176 | | return limit; |
177 | | |
178 | | #else |
179 | | // Simple portable version that doesn't use unaligned access. |
180 | | # define LZMA_MEMCMPLEN_EXTRA 0 |
181 | | while (len < limit && buf1[len] == buf2[len]) |
182 | | ++len; |
183 | | |
184 | | return len; |
185 | | #endif |
186 | 0 | } Unexecuted instantiation: lzma_encoder_optimum_fast.c:lzma_memcmplen Unexecuted instantiation: lzma_encoder_optimum_normal.c:lzma_memcmplen Unexecuted instantiation: lz_encoder.c:lzma_memcmplen Unexecuted instantiation: lz_encoder_mf.c:lzma_memcmplen |
187 | | |
188 | | #endif |