/src/xz/src/liblzma/common/memcmplen.h
Line | Count | Source |
1 | | // SPDX-License-Identifier: 0BSD |
2 | | |
3 | | /////////////////////////////////////////////////////////////////////////////// |
4 | | // |
5 | | /// \file memcmplen.h |
6 | | /// \brief Optimized comparison of two buffers |
7 | | // |
8 | | // Author: Lasse Collin |
9 | | // |
10 | | /////////////////////////////////////////////////////////////////////////////// |
11 | | |
12 | | #ifndef LZMA_MEMCMPLEN_H |
13 | | #define LZMA_MEMCMPLEN_H |
14 | | |
15 | | #include "common.h" |
16 | | |
17 | | #ifdef HAVE_IMMINTRIN_H |
18 | | # include <immintrin.h> |
19 | | #endif |
20 | | |
21 | | // Only include <intrin.h> if it is needed. The header is only needed |
22 | | // on Windows when using an MSVC compatible compiler. The Intel compiler |
23 | | // can use the intrinsics without the header file. |
24 | | #if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \ |
25 | | && defined(_MSC_VER) \ |
26 | | && (defined(_M_X64) \ |
27 | | || defined(_M_ARM64) || defined(_M_ARM64EC)) \ |
28 | | && !defined(__INTEL_COMPILER) |
29 | | # include <intrin.h> |
30 | | #endif |
31 | | |
32 | | |
/// Find out how many equal bytes the two buffers have.
///
/// \param       buf1    First buffer
/// \param       buf2    Second buffer
/// \param       len     How many bytes have already been compared and will
///                      be assumed to match
/// \param       limit   How many bytes to compare at most, including the
///                      already-compared bytes. This must be significantly
///                      smaller than UINT32_MAX to avoid integer overflows.
///                      Up to LZMA_MEMCMPLEN_EXTRA bytes may be read past
///                      the specified limit from both buf1 and buf2.
///
/// \return      Number of equal bytes in the buffers is returned.
///              This is always at least len and at most limit.
///
/// \note        LZMA_MEMCMPLEN_EXTRA defines how many extra bytes may be read.
///              It's rounded up to 2^n. This extra amount needs to be
///              allocated in the buffers being used. It needs to be
///              initialized too to keep Valgrind quiet.
static lzma_always_inline uint32_t
lzma_memcmplen(const uint8_t *buf1, const uint8_t *buf2,
		uint32_t len, uint32_t limit)
{
	assert(len <= limit);
	// Keeping limit far below UINT32_MAX guarantees that the
	// "len += 8" / "len += 16" steps below can never wrap around.
	assert(limit <= UINT32_MAX / 2);

#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
		&& (((TUKLIB_GNUC_REQ(3, 4) || defined(__clang__)) \
			&& SIZE_MAX == UINT64_MAX) \
		|| (defined(__INTEL_COMPILER) && defined(__x86_64__)) \
		|| (defined(__INTEL_COMPILER) && defined(_M_X64)) \
		|| (defined(_MSC_VER) && (defined(_M_X64) \
			|| defined(_M_ARM64) || defined(_M_ARM64EC))))
	// This is only for x86-64 and ARM64 for now. This might be fine on
	// other 64-bit processors too.
	//
	// Reasons to use subtraction instead of xor:
	//
	//   - On some x86-64 processors (Intel Sandy Bridge to Tiger Lake),
	//     sub+jz and sub+jnz can be fused but xor+jz or xor+jnz cannot.
	//     Thus using subtraction has potential to be a tiny amount faster
	//     since the code checks if the quotient is non-zero.
	//
	//   - Some processors (Intel Pentium 4) used to have more ALU
	//     resources for add/sub instructions than and/or/xor.
	//
	// The processor info is based on Agner Fog's microarchitecture.pdf
	// version 2023-05-26. https://www.agner.org/optimize/
#define LZMA_MEMCMPLEN_EXTRA 8
	while (len < limit) {
#	ifdef WORDS_BIGENDIAN
		const uint64_t x = read64ne(buf1 + len) ^ read64ne(buf2 + len);
#	else
		// Subtraction works here because, in little-endian order, all
		// bytes below the first differing byte are equal and thus
		// subtract to zero without producing a borrow, while the first
		// differing byte itself becomes non-zero. Therefore counting
		// the trailing zero bits still locates the first difference.
		const uint64_t x = read64ne(buf1 + len) - read64ne(buf2 + len);
#	endif
		if (x != 0) {
			// MSVC or Intel C compiler on Windows
#	if defined(_MSC_VER) || defined(__INTEL_COMPILER)
			unsigned long tmp;
			_BitScanForward64(&tmp, x);
			// Bit index / 8 = index of the first differing byte.
			len += (uint32_t)tmp >> 3;
			// GCC, Clang, or Intel C compiler
#	elif defined(WORDS_BIGENDIAN)
			// Big endian: the first differing byte is in the most
			// significant end, so count leading zero bits instead.
			len += (uint32_t)__builtin_clzll(x) >> 3;
#	else
			len += (uint32_t)__builtin_ctzll(x) >> 3;
#	endif
			// The 8-byte read may have compared bytes past limit,
			// so clamp the result back to the caller's limit.
			return my_min(len, limit);
		}

		len += 8;
	}

	return limit;

#elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) \
		&& defined(HAVE__MM_MOVEMASK_EPI8) \
		&& (defined(__SSE2__) \
			|| (defined(_MSC_VER) && defined(_M_IX86_FP) \
				&& _M_IX86_FP >= 2))
	// NOTE: This will use 128-bit unaligned access which
	// TUKLIB_FAST_UNALIGNED_ACCESS wasn't meant to permit,
	// but it's convenient here since this is x86-only.
	//
	// SSE2 version for 32-bit and 64-bit x86. On x86-64 the above
	// version is sometimes significantly faster and sometimes
	// slightly slower than this SSE2 version, so this SSE2
	// version isn't used on x86-64.
#	define LZMA_MEMCMPLEN_EXTRA 16
	while (len < limit) {
		// _mm_movemask_epi8 sets a bit for each byte that compared
		// EQUAL; xor with 0xFFFF inverts it so that set bits mark
		// the MISmatching bytes, letting ctz32 find the first one.
		const uint32_t x = 0xFFFF ^ (uint32_t)_mm_movemask_epi8(
				_mm_cmpeq_epi8(
				_mm_loadu_si128((const __m128i *)(buf1 + len)),
				_mm_loadu_si128((const __m128i *)(buf2 + len))));

		if (x != 0) {
			len += ctz32(x);
			// The 16-byte read may have gone past limit; clamp.
			return my_min(len, limit);
		}

		len += 16;
	}

	return limit;

#elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) && !defined(WORDS_BIGENDIAN)
	// Generic 32-bit little endian method
#	define LZMA_MEMCMPLEN_EXTRA 4
	while (len < limit) {
		// Same subtraction trick as the 64-bit path above: the first
		// differing byte is the lowest non-zero byte of x.
		uint32_t x = read32ne(buf1 + len) - read32ne(buf2 + len);
		if (x != 0) {
			// Narrow down to the first differing byte without
			// a bit-scan intrinsic: skip a zero low halfword,
			// then a zero low byte.
			if ((x & 0xFFFF) == 0) {
				len += 2;
				x >>= 16;
			}

			if ((x & 0xFF) == 0)
				++len;

			return my_min(len, limit);
		}

		len += 4;
	}

	return limit;

#elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) && defined(WORDS_BIGENDIAN)
	// Generic 32-bit big endian method
#	define LZMA_MEMCMPLEN_EXTRA 4
	while (len < limit) {
		// Big endian uses xor: the first differing byte is the
		// highest non-zero byte of x.
		uint32_t x = read32ne(buf1 + len) ^ read32ne(buf2 + len);
		if (x != 0) {
			// Mirror of the little-endian narrowing: check the
			// HIGH halfword/byte and shift upward instead.
			if ((x & 0xFFFF0000) == 0) {
				len += 2;
				x <<= 16;
			}

			if ((x & 0xFF000000) == 0)
				++len;

			return my_min(len, limit);
		}

		len += 4;
	}

	return limit;

#else
	// Simple portable version that doesn't use unaligned access.
	// Never reads past limit, hence no extra buffer space is needed.
#	define LZMA_MEMCMPLEN_EXTRA 0
	while (len < limit && buf1[len] == buf2[len])
		++len;

	return len;
#endif
}
191 | | |
192 | | #endif |