Coverage Report

Created: 2025-11-16 07:22

/src/xz/src/liblzma/check/crc_x86_clmul.h

Line |   Count | Source
   1 |         | // SPDX-License-Identifier: 0BSD
   2 |         |
   3 |         | ///////////////////////////////////////////////////////////////////////////////
   4 |         | //
   5 |         | /// \file       crc_x86_clmul.h
   6 |         | /// \brief      CRC32 and CRC64 implementations using CLMUL instructions.
   7 |         | ///
   8 |         | /// The CRC32 and CRC64 implementations use 32/64-bit x86 SSSE3, SSE4.1, and
   9 |         | /// CLMUL instructions. This is compatible with Elbrus 2000 (E2K) too.
  10 |         | ///
  11 |         | /// See the Intel white paper "Fast CRC Computation for Generic Polynomials
  12 |         | /// Using PCLMULQDQ Instruction" from 2009. The original file seems to be
  13 |         | /// gone from Intel's website but a version is available here:
  14 |         | /// https://www.researchgate.net/publication/263424619_Fast_CRC_computation
  15 |         | /// (The link was checked on 2024-06-11.)
  16 |         | ///
  17 |         | /// While this file has both CRC32 and CRC64 implementations, only one
  18 |         | /// can be built at a time. The version to build is selected by defining
  19 |         | /// BUILDING_CRC_CLMUL to 32 or 64 before including this file.
  20 |         | ///
  21 |         | /// NOTE: The x86 CLMUL CRC implementation was rewritten for XZ Utils 5.8.0.
  22 |         | //
  23 |         | //  Authors:    Lasse Collin
  24 |         | //              Ilya Kurdyukov
  25 |         | //
  26 |         | ///////////////////////////////////////////////////////////////////////////////
  27 |         |
  28 |         | // This file must not be included more than once.
  29 |         | #ifdef LZMA_CRC_X86_CLMUL_H
  30 |         | # error crc_x86_clmul.h was included twice.
  31 |         | #endif
  32 |         | #define LZMA_CRC_X86_CLMUL_H
  33 |         |
  34 |         | #if BUILDING_CRC_CLMUL != 32 && BUILDING_CRC_CLMUL != 64
  35 |         | # error BUILDING_CRC_CLMUL is undefined or has an invalid value
  36 |         | #endif
  37 |         |
  38 |         | #include <immintrin.h>
  39 |         |
  40 |         | #if defined(_MSC_VER)
  41 |         | # include <intrin.h>
  42 |         | #elif defined(HAVE_CPUID_H)
  43 |         | # include <cpuid.h>
  44 |         | #endif
  45 |         |
  46 |         |
  47 |         | // EDG-based compilers (Intel's classic compiler and compiler for E2K) can
  48 |         | // define __GNUC__ but the attribute must not be used with them.
  49 |         | // The new Clang-based ICX needs the attribute.
  50 |         | //
  51 |         | // NOTE: Build systems check for this too, keep them in sync with this.
  52 |         | #if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
  53 |         | # define crc_attr_target \
  54 |         |     __attribute__((__target__("ssse3,sse4.1,pclmul")))
  55 |         | #else
  56 |         | # define crc_attr_target
  57 |         | #endif
  58 |         |
  59 |         |
  60 |         | // GCC and Clang would produce good code with _mm_set_epi64x
  61 |         | // but MSVC needs _mm_cvtsi64_si128 on x86-64.
  62 |         | #if defined(__i386__) || defined(_M_IX86)
  63 |         | # define my_set_low64(a) _mm_set_epi64x(0, (a))
  64 |         | #else
  65 |   59.0k | # define my_set_low64(a) _mm_cvtsi64_si128(a)
  66 |         | #endif
  67 |         |
  68 |         |
  69 |         | // Align it so that the whole array is within the same cache line.
  70 |         | // More than one unaligned load can be done from this during the
  71 |         | // same CRC function call.
  72 |         | //
  73 |         | // The bytes [0] to [31] are used with AND to clear the low bytes. (With ANDN
  74 |         | // those could be used to clear the high bytes too but it's not needed here.)
  75 |         | //
  76 |         | // The bytes [16] to [47] are for left shifts.
  77 |         | // The bytes [32] to [63] are for right shifts.
  78 |         | alignas(64)
  79 |         | static uint8_t vmasks[64] = {
  80 |         |   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  81 |         |   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  82 |         |   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  83 |         |   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  84 |         |   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  85 |         |   0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
  86 |         |   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  87 |         |   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
  88 |         | };
  89 |         |
  90 |         |
  91 |         | // *Unaligned* 128-bit load
  92 |         | crc_attr_target
  93 |         | static inline __m128i
  94 |         | my_load128(const uint8_t *p)
  95 |   1.09M | {
  96 |   1.09M |   return _mm_loadu_si128((const __m128i *)p);
  97 |   1.09M | }

crc32_fast.c:my_load128
Line |   Count | Source
  95 |    930k | {
  96 |    930k |   return _mm_loadu_si128((const __m128i *)p);
  97 |    930k | }

crc64_fast.c:my_load128
Line |   Count | Source
  95 |    166k | {
  96 |    166k |   return _mm_loadu_si128((const __m128i *)p);
  97 |    166k | }
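
As the header comment (lines 17-19) says, only one of the two implementations is compiled per translation unit, selected by defining BUILDING_CRC_CLMUL before inclusion. A minimal sketch of how a consumer might pull in the CRC32 build of this header; the surrounding file is illustrative only and omits the internal helpers (read32le(), read64le(), and so on) that the real crc32_fast.c provides:

    // Illustrative consumer of crc_x86_clmul.h (not the actual crc32_fast.c).
    // Defining 32 here compiles crc32_arch_optimized(); defining 64 instead
    // would compile crc64_arch_optimized().
    #define BUILDING_CRC_CLMUL 32
    #include "crc_x86_clmul.h"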
  98 |         |
  99 |         |
 100 |         | // Keep the highest "count" bytes as is and clear the remaining low bytes.
 101 |         | crc_attr_target
 102 |         | static inline __m128i
 103 |         | keep_high_bytes(__m128i v, size_t count)
 104 |   14.1k | {
 105 |   14.1k |   return _mm_and_si128(my_load128((vmasks + count)), v);
 106 |   14.1k | }

crc32_fast.c:keep_high_bytes
Line |   Count | Source
 104 |   12.5k | {
 105 |   12.5k |   return _mm_and_si128(my_load128((vmasks + count)), v);
 106 |   12.5k | }

crc64_fast.c:keep_high_bytes
Line |   Count | Source
 104 |   1.57k | {
 105 |   1.57k |   return _mm_and_si128(my_load128((vmasks + count)), v);
 106 |   1.57k | }

 107 |         |
 108 |         |
 109 |         | // Shift the 128-bit value left by "amount" bytes (not bits).
 110 |         | crc_attr_target
 111 |         | static inline __m128i
 112 |         | shift_left(__m128i v, size_t amount)
 113 |   23.1k | {
 114 |   23.1k |   return _mm_shuffle_epi8(v, my_load128((vmasks + 32 - amount)));
 115 |   23.1k | }

crc32_fast.c:shift_left
Line |   Count | Source
 113 |   21.5k | {
 114 |   21.5k |   return _mm_shuffle_epi8(v, my_load128((vmasks + 32 - amount)));
 115 |   21.5k | }

crc64_fast.c:shift_left
Line |   Count | Source
 113 |   1.59k | {
 114 |   1.59k |   return _mm_shuffle_epi8(v, my_load128((vmasks + 32 - amount)));
 115 |   1.59k | }

 116 |         |
 117 |         |
 118 |         | // Shift the 128-bit value right by "amount" bytes (not bits).
 119 |         | crc_attr_target
 120 |         | static inline __m128i
 121 |         | shift_right(__m128i v, size_t amount)
 122 |   14.1k | {
 123 |   14.1k |   return _mm_shuffle_epi8(v, my_load128((vmasks + 32 + amount)));
 124 |   14.1k | }

crc32_fast.c:shift_right
Line |   Count | Source
 122 |   12.5k | {
 123 |   12.5k |   return _mm_shuffle_epi8(v, my_load128((vmasks + 32 + amount)));
 124 |   12.5k | }

crc64_fast.c:shift_right
Line |   Count | Source
 122 |   1.57k | {
 123 |   1.57k |   return _mm_shuffle_epi8(v, my_load128((vmasks + 32 + amount)));
 124 |   1.57k | }
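
The three helpers above all index into the same 64-byte vmasks table: keep_high_bytes() uses the 0x00/0xFF region as an AND mask, while shift_left() and shift_right() feed the 0xFF / 0x00..0x0F region to _mm_shuffle_epi8, relying on PSHUFB's rule that an index byte with the high bit set produces a zero byte. A scalar model of the shuffle-based shift, as an illustration only (pshufb_model() is not from the XZ sources):

    #include <stdint.h>

    // Scalar model of _mm_shuffle_epi8: an index with the high bit set yields
    // a zero byte, otherwise src[index & 0x0F] is selected.
    static void pshufb_model(uint8_t dst[16], const uint8_t src[16],
            const uint8_t ctl[16])
    {
        for (int i = 0; i < 16; ++i)
            dst[i] = (ctl[i] & 0x80) ? 0 : src[ctl[i] & 0x0F];
    }

    // With the vmasks layout above, ctl = vmasks + 32 - amount begins with
    // "amount" copies of 0xFF followed by 0x00, 0x01, ..., so dst[j + amount]
    // equals src[j] and the low "amount" bytes become zero: a byte-granular
    // _mm_slli_si128 with a runtime shift amount. Reading from
    // vmasks + 32 + amount gives the matching right shift.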
 125 |         |
 126 |         |
 127 |         | crc_attr_target
 128 |         | static inline __m128i
 129 |         | fold(__m128i v, __m128i k)
 130 |    998k | {
 131 |    998k |   __m128i a = _mm_clmulepi64_si128(v, k, 0x00);
 132 |    998k |   __m128i b = _mm_clmulepi64_si128(v, k, 0x11);
 133 |    998k |   return _mm_xor_si128(a, b);
 134 |    998k | }

crc32_fast.c:fold
Line |   Count | Source
 130 |    840k | {
 131 |    840k |   __m128i a = _mm_clmulepi64_si128(v, k, 0x00);
 132 |         |   __m128i b = _mm_clmulepi64_si128(v, k, 0x11);
 133 |    840k |   return _mm_xor_si128(a, b);
 134 |    840k | }

crc64_fast.c:fold
Line |   Count | Source
 130 |    158k | {
 131 |    158k |   __m128i a = _mm_clmulepi64_si128(v, k, 0x00);
 132 |         |   __m128i b = _mm_clmulepi64_si128(v, k, 0x11);
 133 |    158k |   return _mm_xor_si128(a, b);
 134 |    158k | }

 135 |         |
 136 |         |
 137 |         | crc_attr_target
 138 |         | static inline __m128i
 139 |         | fold_xor(__m128i v, __m128i k, const uint8_t *buf)
 140 |    845k | {
 141 |    845k |   return _mm_xor_si128(my_load128(buf), fold(v, k));
 142 |    845k | }

crc32_fast.c:fold_xor
Line |   Count | Source
 140 |    699k | {
 141 |    699k |   return _mm_xor_si128(my_load128(buf), fold(v, k));
 142 |    699k | }

crc64_fast.c:fold_xor
Line |   Count | Source
 140 |    145k | {
 141 |    145k |   return _mm_xor_si128(my_load128(buf), fold(v, k));
 142 |    145k | }
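
fold() above is the heart of the algorithm: _mm_clmulepi64_si128 performs a 64 x 64 -> 128-bit carry-less multiplication (partial products are combined with XOR instead of addition), and fold() multiplies both 64-bit halves of v by the matching halves of the constant k and XORs the two products. A scalar model of that primitive, for reference only (clmul64() is not part of the XZ sources):

    #include <stdint.h>

    // Carry-less 64x64 multiplication producing a 128-bit result as lo/hi.
    // This is what a single PCLMULQDQ lane computes; fold() runs two of these
    // (selected by the 0x00 and 0x11 immediates) and XORs the results.
    static void clmul64(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
    {
        uint64_t l = 0, h = 0;
        for (int i = 0; i < 64; ++i) {
            if ((b >> i) & 1) {
                l ^= a << i;
                if (i != 0)
                    h ^= a >> (64 - i);
            }
        }
        *lo = l;
        *hi = h;
    }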
 143 |         |
 144 |         |
 145 |         | #if BUILDING_CRC_CLMUL == 32
 146 |         | crc_attr_target
 147 |         | static uint32_t
 148 |         | crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
 149 |         | #else
 150 |         | crc_attr_target
 151 |         | static uint64_t
 152 |         | crc64_arch_optimized(const uint8_t *buf, size_t size, uint64_t crc)
 153 |         | #endif
 154 |   59.0k | {
 155 |         |   // We will assume that there is at least one byte of input.
 156 |   59.0k |   if (size == 0)
 157 |       0 |     return crc;
 158 |         |
 159 |         |   // See crc_clmul_consts_gen.c.
 160 |         | #if BUILDING_CRC_CLMUL == 32
 161 |   55.3k |   const __m128i fold512 = _mm_set_epi64x(0x1d9513d7, 0x8f352d95);
 162 |   55.3k |   const __m128i fold128 = _mm_set_epi64x(0xccaa009e, 0xae689191);
 163 |   55.3k |   const __m128i mu_p = _mm_set_epi64x(
 164 |   55.3k |     (int64_t)0xb4e5b025f7011641, 0x1db710640);
 165 |         | #else
 166 |   3.64k |   const __m128i fold512 = _mm_set_epi64x(
 167 |   3.64k |     (int64_t)0x081f6054a7842df4, (int64_t)0x6ae3efbb9dd441f3);
 168 |         |
 169 |         |   const __m128i fold128 = _mm_set_epi64x(
 170 |         |     (int64_t)0xdabe95afc7875f40, (int64_t)0xe05dd497ca393ae4);
 171 |         |
 172 |   3.64k |   const __m128i mu_p = _mm_set_epi64x(
 173 |   3.64k |     (int64_t)0x9c3e466c172963d5, (int64_t)0x92d8af2baf0e1e84);
 174 |   3.64k | #endif
 175 |         |
 176 |   3.64k |   __m128i v0, v1, v2, v3;
 177 |         |
 178 |   3.64k |   crc = ~crc;
 179 |         |
 180 |   59.0k |   if (size < 8) {
 181 |   7.14k |     uint64_t x = crc;
 182 |   7.14k |     size_t i = 0;
 183 |         |
 184 |         |     // Checking the bit instead of comparing the size means
 185 |         |     // that we don't need to update the size between the steps.
 186 |   7.14k |     if (size & 4) {
 187 |   1.71k |       x ^= read32le(buf);
 188 |   1.71k |       buf += 4;
 189 |   1.71k |       i = 32;
 190 |   1.71k |     }
 191 |         |
 192 |   7.14k |     if (size & 2) {
 193 |   6.94k |       x ^= (uint64_t)read16le(buf) << i;
 194 |   6.94k |       buf += 2;
 195 |   6.94k |       i += 16;
 196 |   6.94k |     }
 197 |         |
 198 |   7.14k |     if (size & 1)
 199 |      34 |       x ^= (uint64_t)*buf << i;
 200 |         |
 201 |   7.14k |     v0 = my_set_low64((int64_t)x);
 202 |   7.14k |     v0 = shift_left(v0, 8 - size);
 203 |         |
 204 |   51.8k |   } else if (size < 16) {
 205 |   4.78k |     v0 = my_set_low64((int64_t)(crc ^ read64le(buf)));
 206 |         |
 207 |         |     // NOTE: buf is intentionally left 8 bytes behind so that
 208 |         |     // we can read the last 1-7 bytes with read64le(buf + size).
 209 |   4.78k |     size -= 8;
 210 |         |
 211 |         |     // Handling 8-byte input specially is a speed optimization
 212 |         |     // as the clmul can be skipped. A branch is also needed to
 213 |         |     // avoid a too high shift amount.
 214 |   4.78k |     if (size > 0) {
 215 |   1.86k |       const size_t padding = 8 - size;
 216 |   1.86k |       uint64_t high = read64le(buf + size) >> (padding * 8);
 217 |         |
 218 |         | #if defined(__i386__) || defined(_M_IX86)
 219 |         |       // Simple but likely not the best code for 32-bit x86.
 220 |         |       v0 = _mm_insert_epi32(v0, (int32_t)high, 2);
 221 |         |       v0 = _mm_insert_epi32(v0, (int32_t)(high >> 32), 3);
 222 |         | #else
 223 |   1.86k |       v0 = _mm_insert_epi64(v0, (int64_t)high, 1);
 224 |   1.86k | #endif
 225 |         |
 226 |   1.86k |       v0 = shift_left(v0, padding);
 227 |         |
 228 |   1.86k |       v1 = _mm_srli_si128(v0, 8);
 229 |   1.86k |       v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
 230 |   1.86k |       v0 = _mm_xor_si128(v0, v1);
 231 |   1.86k |     }
 232 |   47.0k |   } else {
 233 |   47.0k |     v0 = my_set_low64((int64_t)crc);
 234 |         |
 235 |         |     // To align or not to align the buf pointer? If the end of
 236 |         |     // the buffer isn't aligned, aligning the pointer here would
 237 |         |     // make us do an extra folding step with the associated byte
 238 |         |     // shuffling overhead. The cost of that would need to be
 239 |         |     // lower than the benefit of aligned reads. Testing on an old
 240 |         |     // Intel Ivy Bridge processor suggested that aligning isn't
 241 |         |     // worth the cost but it likely depends on the processor and
 242 |         |     // buffer size. Unaligned loads (MOVDQU) should be fast on
 243 |         |     // x86 processors that support PCLMULQDQ, so we don't align
 244 |         |     // the buf pointer here.
 245 |         |
 246 |         |     // Read the first (and possibly the only) full 16 bytes.
 247 |   47.0k |     v0 = _mm_xor_si128(v0, my_load128(buf));
 248 |   47.0k |     buf += 16;
 249 |   47.0k |     size -= 16;
 250 |         |
 251 |   47.0k |     if (size >= 48) {
 252 |   46.1k |       v1 = my_load128(buf);
 253 |   46.1k |       v2 = my_load128(buf + 16);
 254 |   46.1k |       v3 = my_load128(buf + 32);
 255 |   46.1k |       buf += 48;
 256 |   46.1k |       size -= 48;
 257 |         |
 258 |    242k |       while (size >= 64) {
 259 |    195k |         v0 = fold_xor(v0, fold512, buf);
 260 |    195k |         v1 = fold_xor(v1, fold512, buf + 16);
 261 |    195k |         v2 = fold_xor(v2, fold512, buf + 32);
 262 |    195k |         v3 = fold_xor(v3, fold512, buf + 48);
 263 |    195k |         buf += 64;
 264 |    195k |         size -= 64;
 265 |    195k |       }
 266 |         |
 267 |   46.1k |       v0 = _mm_xor_si128(v1, fold(v0, fold128));
 268 |   46.1k |       v0 = _mm_xor_si128(v2, fold(v0, fold128));
 269 |   46.1k |       v0 = _mm_xor_si128(v3, fold(v0, fold128));
 270 |   46.1k |     }
 271 |         |
 272 |    109k |     while (size >= 16) {
 273 |   62.2k |       v0 = fold_xor(v0, fold128, buf);
 274 |   62.2k |       buf += 16;
 275 |   62.2k |       size -= 16;
 276 |   62.2k |     }
 277 |         |
 278 |   47.0k |     if (size > 0) {
 279 |         |       // We want the last "size" number of input bytes to
 280 |         |       // be at the high bits of v1. First do a full 16-byte
 281 |         |       // load and then mask the low bytes to zeros.
 282 |   14.1k |       v1 = my_load128(buf + size - 16);
 283 |   14.1k |       v1 = keep_high_bytes(v1, size);
 284 |         |
 285 |         |       // Shift high bytes from v0 to the low bytes of v1.
 286 |         |       //
 287 |         |       // Alternatively we could replace the combination
 288 |         |       // keep_high_bytes + shift_right + _mm_or_si128 with
 289 |         |       // _mm_shuffle_epi8 + _mm_blendv_epi8 but that would
 290 |         |       // require larger tables for the masks. Now there are
 291 |         |       // three loads (instead of two) from the mask tables
 292 |         |       // but they all are from the same cache line.
 293 |   14.1k |       v1 = _mm_or_si128(v1, shift_right(v0, size));
 294 |         |
 295 |         |       // Shift high bytes of v0 away, padding the
 296 |         |       // low bytes with zeros.
 297 |   14.1k |       v0 = shift_left(v0, 16 - size);
 298 |         |
 299 |   14.1k |       v0 = _mm_xor_si128(v1, fold(v0, fold128));
 300 |   14.1k |     }
 301 |         |
 302 |   47.0k |     v1 = _mm_srli_si128(v0, 8);
 303 |   47.0k |     v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
 304 |   47.0k |     v0 = _mm_xor_si128(v0, v1);
 305 |   47.0k |   }
 306 |         |
 307 |         |   // Barrett reduction
 308 |         |
 309 |         | #if BUILDING_CRC_CLMUL == 32
 310 |         |   v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
 311 |         |   v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
 312 |         |   v0 = _mm_xor_si128(v0, v1);
 313 |         |   return ~(uint32_t)_mm_extract_epi32(v0, 2);
 314 |         | #else
 315 |         |   // Because p is 65 bits but one bit doesn't fit into the 64-bit
 316 |         |   // half of __m128i, finish the second clmul by shifting v1 left
 317 |         |   // by 64 bits and xorring it to the final result.
 318 |         |   v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
 319 |         |   v2 = _mm_slli_si128(v1, 8);
 320 |         |   v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
 321 |         |   v0 = _mm_xor_si128(v0, v2);
 322 |         |   v0 = _mm_xor_si128(v0, v1);
 323 |         | #if defined(__i386__) || defined(_M_IX86)
 324 |         |   return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) |
 325 |         |       (uint64_t)(uint32_t)_mm_extract_epi32(v0, 2));
 326 |         | #else
 327 |         |   return ~(uint64_t)_mm_extract_epi64(v0, 1);
 328 |         | #endif
 329 |         | #endif
 330 |   59.0k | }
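
The counts above show all three input-size paths being exercised: size < 8 (7.14k hits on line 181), 8 <= size < 16 (4.78k on line 205), and the 16-byte folding path (47.0k on line 233), with the 64-byte loop body running 195k times. When validating an optimized CRC like this, a bit-by-bit oracle with the same parameters is useful. A minimal sketch for the CRC32 case, assuming the standard reflected polynomial 0xEDB88320 with all-ones initial value and final XOR (check value for "123456789" is 0xCBF43926); this is illustrative test code, not part of liblzma:

    #include <stddef.h>
    #include <stdint.h>

    // Slow reference CRC32 (reflected, poly 0xEDB88320). The (buf, size, crc)
    // interface mirrors crc32_arch_optimized() above so results can be
    // compared directly for every input size from 1 upward.
    static uint32_t crc32_reference(const uint8_t *buf, size_t size,
            uint32_t crc)
    {
        crc = ~crc;
        for (size_t i = 0; i < size; ++i) {
            crc ^= buf[i];
            for (int bit = 0; bit < 8; ++bit)
                crc = (crc >> 1) ^ (0xEDB88320U & (0U - (crc & 1U)));
        }
        return ~crc;
    }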
crc32_fast.c:crc32_arch_optimized
Line |   Count | Source
 154 |   55.3k | {
 155 |         |   // We will assume that there is at least one byte of input.
 156 |   55.3k |   if (size == 0)
 157 |       0 |     return crc;
 158 |         |
 159 |         |   // See crc_clmul_consts_gen.c.
 160 |   55.3k | #if BUILDING_CRC_CLMUL == 32
 161 |   55.3k |   const __m128i fold512 = _mm_set_epi64x(0x1d9513d7, 0x8f352d95);
 162 |   55.3k |   const __m128i fold128 = _mm_set_epi64x(0xccaa009e, 0xae689191);
 163 |   55.3k |   const __m128i mu_p = _mm_set_epi64x(
 164 |   55.3k |     (int64_t)0xb4e5b025f7011641, 0x1db710640);
 165 |         | #else
 166 |         |   const __m128i fold512 = _mm_set_epi64x(
 167 |         |     (int64_t)0x081f6054a7842df4, (int64_t)0x6ae3efbb9dd441f3);
 168 |         |
 169 |         |   const __m128i fold128 = _mm_set_epi64x(
 170 |         |     (int64_t)0xdabe95afc7875f40, (int64_t)0xe05dd497ca393ae4);
 171 |         |
 172 |         |   const __m128i mu_p = _mm_set_epi64x(
 173 |         |     (int64_t)0x9c3e466c172963d5, (int64_t)0x92d8af2baf0e1e84);
 174 |         | #endif
 175 |         |
 176 |   55.3k |   __m128i v0, v1, v2, v3;
 177 |         |
 178 |   55.3k |   crc = ~crc;
 179 |         |
 180 |   55.3k |   if (size < 8) {
 181 |   7.12k |     uint64_t x = crc;
 182 |   7.12k |     size_t i = 0;
 183 |         |
 184 |         |     // Checking the bit instead of comparing the size means
 185 |         |     // that we don't need to update the size between the steps.
 186 |   7.12k |     if (size & 4) {
 187 |   1.71k |       x ^= read32le(buf);
 188 |   1.71k |       buf += 4;
 189 |   1.71k |       i = 32;
 190 |   1.71k |     }
 191 |         |
 192 |   7.12k |     if (size & 2) {
 193 |   6.93k |       x ^= (uint64_t)read16le(buf) << i;
 194 |   6.93k |       buf += 2;
 195 |   6.93k |       i += 16;
 196 |   6.93k |     }
 197 |         |
 198 |   7.12k |     if (size & 1)
 199 |      26 |       x ^= (uint64_t)*buf << i;
 200 |         |
 201 |   7.12k |     v0 = my_set_low64((int64_t)x);
 202 |   7.12k |     v0 = shift_left(v0, 8 - size);
 203 |         |
 204 |   48.2k |   } else if (size < 16) {
 205 |   4.78k |     v0 = my_set_low64((int64_t)(crc ^ read64le(buf)));
 206 |         |
 207 |         |     // NOTE: buf is intentionally left 8 bytes behind so that
 208 |         |     // we can read the last 1-7 bytes with read64le(buf + size).
 209 |   4.78k |     size -= 8;
 210 |         |
 211 |         |     // Handling 8-byte input specially is a speed optimization
 212 |         |     // as the clmul can be skipped. A branch is also needed to
 213 |         |     // avoid a too high shift amount.
 214 |   4.78k |     if (size > 0) {
 215 |   1.85k |       const size_t padding = 8 - size;
 216 |   1.85k |       uint64_t high = read64le(buf + size) >> (padding * 8);
 217 |         |
 218 |         | #if defined(__i386__) || defined(_M_IX86)
 219 |         |       // Simple but likely not the best code for 32-bit x86.
 220 |         |       v0 = _mm_insert_epi32(v0, (int32_t)high, 2);
 221 |         |       v0 = _mm_insert_epi32(v0, (int32_t)(high >> 32), 3);
 222 |         | #else
 223 |   1.85k |       v0 = _mm_insert_epi64(v0, (int64_t)high, 1);
 224 |   1.85k | #endif
 225 |         |
 226 |   1.85k |       v0 = shift_left(v0, padding);
 227 |         |
 228 |   1.85k |       v1 = _mm_srli_si128(v0, 8);
 229 |   1.85k |       v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
 230 |   1.85k |       v0 = _mm_xor_si128(v0, v1);
 231 |   1.85k |     }
 232 |   43.4k |   } else {
 233 |   43.4k |     v0 = my_set_low64((int64_t)crc);
 234 |         |
 235 |         |     // To align or not to align the buf pointer? If the end of
 236 |         |     // the buffer isn't aligned, aligning the pointer here would
 237 |         |     // make us do an extra folding step with the associated byte
 238 |         |     // shuffling overhead. The cost of that would need to be
 239 |         |     // lower than the benefit of aligned reads. Testing on an old
 240 |         |     // Intel Ivy Bridge processor suggested that aligning isn't
 241 |         |     // worth the cost but it likely depends on the processor and
 242 |         |     // buffer size. Unaligned loads (MOVDQU) should be fast on
 243 |         |     // x86 processors that support PCLMULQDQ, so we don't align
 244 |         |     // the buf pointer here.
 245 |         |
 246 |         |     // Read the first (and possibly the only) full 16 bytes.
 247 |   43.4k |     v0 = _mm_xor_si128(v0, my_load128(buf));
 248 |   43.4k |     buf += 16;
 249 |   43.4k |     size -= 16;
 250 |         |
 251 |   43.4k |     if (size >= 48) {
 252 |   42.5k |       v1 = my_load128(buf);
 253 |   42.5k |       v2 = my_load128(buf + 16);
 254 |   42.5k |       v3 = my_load128(buf + 32);
 255 |   42.5k |       buf += 48;
 256 |   42.5k |       size -= 48;
 257 |         |
 258 |    203k |       while (size >= 64) {
 259 |    160k |         v0 = fold_xor(v0, fold512, buf);
 260 |    160k |         v1 = fold_xor(v1, fold512, buf + 16);
 261 |    160k |         v2 = fold_xor(v2, fold512, buf + 32);
 262 |    160k |         v3 = fold_xor(v3, fold512, buf + 48);
 263 |    160k |         buf += 64;
 264 |    160k |         size -= 64;
 265 |    160k |       }
 266 |         |
 267 |   42.5k |       v0 = _mm_xor_si128(v1, fold(v0, fold128));
 268 |   42.5k |       v0 = _mm_xor_si128(v2, fold(v0, fold128));
 269 |   42.5k |       v0 = _mm_xor_si128(v3, fold(v0, fold128));
 270 |   42.5k |     }
 271 |         |
 272 |    100k |     while (size >= 16) {
 273 |   57.5k |       v0 = fold_xor(v0, fold128, buf);
 274 |   57.5k |       buf += 16;
 275 |   57.5k |       size -= 16;
 276 |   57.5k |     }
 277 |         |
 278 |   43.4k |     if (size > 0) {
 279 |         |       // We want the last "size" number of input bytes to
 280 |         |       // be at the high bits of v1. First do a full 16-byte
 281 |         |       // load and then mask the low bytes to zeros.
 282 |   12.5k |       v1 = my_load128(buf + size - 16);
 283 |   12.5k |       v1 = keep_high_bytes(v1, size);
 284 |         |
 285 |         |       // Shift high bytes from v0 to the low bytes of v1.
 286 |         |       //
 287 |         |       // Alternatively we could replace the combination
 288 |         |       // keep_high_bytes + shift_right + _mm_or_si128 with
 289 |         |       // _mm_shuffle_epi8 + _mm_blendv_epi8 but that would
 290 |         |       // require larger tables for the masks. Now there are
 291 |         |       // three loads (instead of two) from the mask tables
 292 |         |       // but they all are from the same cache line.
 293 |   12.5k |       v1 = _mm_or_si128(v1, shift_right(v0, size));
 294 |         |
 295 |         |       // Shift high bytes of v0 away, padding the
 296 |         |       // low bytes with zeros.
 297 |   12.5k |       v0 = shift_left(v0, 16 - size);
 298 |         |
 299 |   12.5k |       v0 = _mm_xor_si128(v1, fold(v0, fold128));
 300 |   12.5k |     }
 301 |         |
 302 |   43.4k |     v1 = _mm_srli_si128(v0, 8);
 303 |   43.4k |     v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
 304 |   43.4k |     v0 = _mm_xor_si128(v0, v1);
 305 |   43.4k |   }
 306 |         |
 307 |         |   // Barrett reduction
 308 |         |
 309 |   55.3k | #if BUILDING_CRC_CLMUL == 32
 310 |   55.3k |   v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
 311 |   55.3k |   v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
 312 |   55.3k |   v0 = _mm_xor_si128(v0, v1);
 313 |   55.3k |   return ~(uint32_t)_mm_extract_epi32(v0, 2);
 314 |         | #else
 315 |         |   // Because p is 65 bits but one bit doesn't fit into the 64-bit
 316 |         |   // half of __m128i, finish the second clmul by shifting v1 left
 317 |         |   // by 64 bits and xorring it to the final result.
 318 |         |   v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
 319 |         |   v2 = _mm_slli_si128(v1, 8);
 320 |         |   v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
 321 |         |   v0 = _mm_xor_si128(v0, v2);
 322 |         |   v0 = _mm_xor_si128(v0, v1);
 323 |         | #if defined(__i386__) || defined(_M_IX86)
 324 |         |   return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) |
 325 |         |       (uint64_t)(uint32_t)_mm_extract_epi32(v0, 2));
 326 |         | #else
 327 |         |   return ~(uint64_t)_mm_extract_epi64(v0, 1);
 328 |         | #endif
 329 |         | #endif
 330 |   55.3k | }

crc64_fast.c:crc64_arch_optimized
Line |   Count | Source
 154 |   3.64k | {
 155 |         |   // We will assume that there is at least one byte of input.
 156 |   3.64k |   if (size == 0)
 157 |       0 |     return crc;
 158 |         |
 159 |         |   // See crc_clmul_consts_gen.c.
 160 |         | #if BUILDING_CRC_CLMUL == 32
 161 |         |   const __m128i fold512 = _mm_set_epi64x(0x1d9513d7, 0x8f352d95);
 162 |         |   const __m128i fold128 = _mm_set_epi64x(0xccaa009e, 0xae689191);
 163 |         |   const __m128i mu_p = _mm_set_epi64x(
 164 |         |     (int64_t)0xb4e5b025f7011641, 0x1db710640);
 165 |         | #else
 166 |   3.64k |   const __m128i fold512 = _mm_set_epi64x(
 167 |   3.64k |     (int64_t)0x081f6054a7842df4, (int64_t)0x6ae3efbb9dd441f3);
 168 |         |
 169 |   3.64k |   const __m128i fold128 = _mm_set_epi64x(
 170 |   3.64k |     (int64_t)0xdabe95afc7875f40, (int64_t)0xe05dd497ca393ae4);
 171 |         |
 172 |   3.64k |   const __m128i mu_p = _mm_set_epi64x(
 173 |   3.64k |     (int64_t)0x9c3e466c172963d5, (int64_t)0x92d8af2baf0e1e84);
 174 |   3.64k | #endif
 175 |         |
 176 |   3.64k |   __m128i v0, v1, v2, v3;
 177 |         |
 178 |   3.64k |   crc = ~crc;
 179 |         |
 180 |   3.64k |   if (size < 8) {
 181 |      18 |     uint64_t x = crc;
 182 |      18 |     size_t i = 0;
 183 |         |
 184 |         |     // Checking the bit instead of comparing the size means
 185 |         |     // that we don't need to update the size between the steps.
 186 |      18 |     if (size & 4) {
 187 |       7 |       x ^= read32le(buf);
 188 |       7 |       buf += 4;
 189 |       7 |       i = 32;
 190 |       7 |     }
 191 |         |
 192 |      18 |     if (size & 2) {
 193 |      13 |       x ^= (uint64_t)read16le(buf) << i;
 194 |      13 |       buf += 2;
 195 |      13 |       i += 16;
 196 |      13 |     }
 197 |         |
 198 |      18 |     if (size & 1)
 199 |       8 |       x ^= (uint64_t)*buf << i;
 200 |         |
 201 |      18 |     v0 = my_set_low64((int64_t)x);
 202 |      18 |     v0 = shift_left(v0, 8 - size);
 203 |         |
 204 |   3.62k |   } else if (size < 16) {
 205 |       9 |     v0 = my_set_low64((int64_t)(crc ^ read64le(buf)));
 206 |         |
 207 |         |     // NOTE: buf is intentionally left 8 bytes behind so that
 208 |         |     // we can read the last 1-7 bytes with read64le(buf + size).
 209 |       9 |     size -= 8;
 210 |         |
 211 |         |     // Handling 8-byte input specially is a speed optimization
 212 |         |     // as the clmul can be skipped. A branch is also needed to
 213 |         |     // avoid a too high shift amount.
 214 |       9 |     if (size > 0) {
 215 |       7 |       const size_t padding = 8 - size;
 216 |       7 |       uint64_t high = read64le(buf + size) >> (padding * 8);
 217 |         |
 218 |         | #if defined(__i386__) || defined(_M_IX86)
 219 |         |       // Simple but likely not the best code for 32-bit x86.
 220 |         |       v0 = _mm_insert_epi32(v0, (int32_t)high, 2);
 221 |         |       v0 = _mm_insert_epi32(v0, (int32_t)(high >> 32), 3);
 222 |         | #else
 223 |       7 |       v0 = _mm_insert_epi64(v0, (int64_t)high, 1);
 224 |       7 | #endif
 225 |         |
 226 |       7 |       v0 = shift_left(v0, padding);
 227 |         |
 228 |       7 |       v1 = _mm_srli_si128(v0, 8);
 229 |       7 |       v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
 230 |       7 |       v0 = _mm_xor_si128(v0, v1);
 231 |       7 |     }
 232 |   3.61k |   } else {
 233 |   3.61k |     v0 = my_set_low64((int64_t)crc);
 234 |         |
 235 |         |     // To align or not to align the buf pointer? If the end of
 236 |         |     // the buffer isn't aligned, aligning the pointer here would
 237 |         |     // make us do an extra folding step with the associated byte
 238 |         |     // shuffling overhead. The cost of that would need to be
 239 |         |     // lower than the benefit of aligned reads. Testing on an old
 240 |         |     // Intel Ivy Bridge processor suggested that aligning isn't
 241 |         |     // worth the cost but it likely depends on the processor and
 242 |         |     // buffer size. Unaligned loads (MOVDQU) should be fast on
 243 |         |     // x86 processors that support PCLMULQDQ, so we don't align
 244 |         |     // the buf pointer here.
 245 |         |
 246 |         |     // Read the first (and possibly the only) full 16 bytes.
 247 |   3.61k |     v0 = _mm_xor_si128(v0, my_load128(buf));
 248 |   3.61k |     buf += 16;
 249 |   3.61k |     size -= 16;
 250 |         |
 251 |   3.61k |     if (size >= 48) {
 252 |   3.56k |       v1 = my_load128(buf);
 253 |   3.56k |       v2 = my_load128(buf + 16);
 254 |   3.56k |       v3 = my_load128(buf + 32);
 255 |   3.56k |       buf += 48;
 256 |   3.56k |       size -= 48;
 257 |         |
 258 |   38.8k |       while (size >= 64) {
 259 |   35.3k |         v0 = fold_xor(v0, fold512, buf);
 260 |   35.3k |         v1 = fold_xor(v1, fold512, buf + 16);
 261 |   35.3k |         v2 = fold_xor(v2, fold512, buf + 32);
 262 |   35.3k |         v3 = fold_xor(v3, fold512, buf + 48);
 263 |   35.3k |         buf += 64;
 264 |   35.3k |         size -= 64;
 265 |   35.3k |       }
 266 |         |
 267 |   3.56k |       v0 = _mm_xor_si128(v1, fold(v0, fold128));
 268 |   3.56k |       v0 = _mm_xor_si128(v2, fold(v0, fold128));
 269 |   3.56k |       v0 = _mm_xor_si128(v3, fold(v0, fold128));
 270 |   3.56k |     }
 271 |         |
 272 |   8.28k |     while (size >= 16) {
 273 |   4.66k |       v0 = fold_xor(v0, fold128, buf);
 274 |   4.66k |       buf += 16;
 275 |   4.66k |       size -= 16;
 276 |   4.66k |     }
 277 |         |
 278 |   3.61k |     if (size > 0) {
 279 |         |       // We want the last "size" number of input bytes to
 280 |         |       // be at the high bits of v1. First do a full 16-byte
 281 |         |       // load and then mask the low bytes to zeros.
 282 |   1.57k |       v1 = my_load128(buf + size - 16);
 283 |   1.57k |       v1 = keep_high_bytes(v1, size);
 284 |         |
 285 |         |       // Shift high bytes from v0 to the low bytes of v1.
 286 |         |       //
 287 |         |       // Alternatively we could replace the combination
 288 |         |       // keep_high_bytes + shift_right + _mm_or_si128 with
 289 |         |       // _mm_shuffle_epi8 + _mm_blendv_epi8 but that would
 290 |         |       // require larger tables for the masks. Now there are
 291 |         |       // three loads (instead of two) from the mask tables
 292 |         |       // but they all are from the same cache line.
 293 |   1.57k |       v1 = _mm_or_si128(v1, shift_right(v0, size));
 294 |         |
 295 |         |       // Shift high bytes of v0 away, padding the
 296 |         |       // low bytes with zeros.
 297 |   1.57k |       v0 = shift_left(v0, 16 - size);
 298 |         |
 299 |   1.57k |       v0 = _mm_xor_si128(v1, fold(v0, fold128));
 300 |   1.57k |     }
 301 |         |
 302 |   3.61k |     v1 = _mm_srli_si128(v0, 8);
 303 |   3.61k |     v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
 304 |   3.61k |     v0 = _mm_xor_si128(v0, v1);
 305 |   3.61k |   }
 306 |         |
 307 |         |   // Barrett reduction
 308 |         |
 309 |         | #if BUILDING_CRC_CLMUL == 32
 310 |         |   v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
 311 |         |   v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
 312 |         |   v0 = _mm_xor_si128(v0, v1);
 313 |         |   return ~(uint32_t)_mm_extract_epi32(v0, 2);
 314 |         | #else
 315 |         |   // Because p is 65 bits but one bit doesn't fit into the 64-bit
 316 |         |   // half of __m128i, finish the second clmul by shifting v1 left
 317 |         |   // by 64 bits and xorring it to the final result.
 318 |   3.64k |   v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
 319 |   3.64k |   v2 = _mm_slli_si128(v1, 8);
 320 |   3.64k |   v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
 321 |   3.64k |   v0 = _mm_xor_si128(v0, v2);
 322 |   3.64k |   v0 = _mm_xor_si128(v0, v1);
 323 |         | #if defined(__i386__) || defined(_M_IX86)
 324 |         |   return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) |
 325 |         |       (uint64_t)(uint32_t)_mm_extract_epi32(v0, 2));
 326 |         | #else
 327 |         |   return ~(uint64_t)_mm_extract_epi64(v0, 1);
 328 |   3.64k | #endif
 329 |   3.64k | #endif
 330 |   3.64k | }
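
The CRC64 instantiation above follows the same structure with its own fold and Barrett constants. The matching bit-by-bit oracle uses the CRC-64/XZ parameters (reflected polynomial 0xC96C5795D7870F42, all-ones initial value and final XOR; check value for "123456789" is 0x995DC9BBDF1939FA). Again an illustrative sketch, not liblzma code:

    #include <stddef.h>
    #include <stdint.h>

    // Slow reference CRC64 (reflected, poly 0xC96C5795D7870F42), matching the
    // CRC-64/XZ convention and the (buf, size, crc) interface of
    // crc64_arch_optimized() above.
    static uint64_t crc64_reference(const uint8_t *buf, size_t size,
            uint64_t crc)
    {
        crc = ~crc;
        for (size_t i = 0; i < size; ++i) {
            crc ^= buf[i];
            for (int bit = 0; bit < 8; ++bit)
                crc = (crc >> 1)
                        ^ (0xC96C5795D7870F42ULL & (0ULL - (crc & 1U)));
        }
        return ~crc;
    }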
 331 |         |
 332 |         |
 333 |         | // Even though this is an inline function, compile it only when needed.
 334 |         | // This way it won't appear in E2K builds at all.
 335 |         | #if defined(CRC32_GENERIC) || defined(CRC64_GENERIC)
 336 |         | // Inlining this function duplicates the function body in crc32_resolve() and
 337 |         | // crc64_resolve(), but this is acceptable because this is a tiny function.
 338 |         | static inline bool
 339 |         | is_arch_extension_supported(void)
 340 |     504 | {
 341 |     504 |   int success = 1;
 342 |     504 |   uint32_t r[4]; // eax, ebx, ecx, edx
 343 |         |
 344 |         | #if defined(_MSC_VER)
 345 |         |   // This needs <intrin.h> with MSVC. ICC has it as a built-in
 346 |         |   // on all platforms.
 347 |         |   __cpuid(r, 1);
 348 |         | #elif defined(HAVE_CPUID_H)
 349 |         |   // Compared to just using __asm__ to run CPUID, this also checks
 350 |         |   // that CPUID is supported and saves and restores ebx as that is
 351 |         |   // needed with GCC < 5 with position-independent code (PIC).
 352 |     504 |   success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]);
 353 |         | #else
 354 |         |   // Just a fallback that shouldn't be needed.
 355 |         |   __asm__("cpuid\n\t"
 356 |         |       : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3])
 357 |         |       : "a"(1), "c"(0));
 358 |         | #endif
 359 |         |
 360 |         |   // Returns true if these are supported:
 361 |         |   // CLMUL (bit 1 in ecx)
 362 |         |   // SSSE3 (bit 9 in ecx)
 363 |         |   // SSE4.1 (bit 19 in ecx)
 364 |     504 |   const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19);
 365 |     504 |   return success && (r[2] & ecx_mask) == ecx_mask;
 366 |         |
 367 |         |   // Alternative methods that weren't used:
 368 |         |   //   - ICC's _may_i_use_cpu_feature: the other methods should work too.
 369 |         |   //   - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul")
 370 |         |   //
 371 |         |   // CPUID decoding is needed with MSVC anyway and older GCC. This keeps
 372 |         |   // the feature checks in the build system simpler too. The nice thing
 373 |         |   // about __builtin_cpu_supports would be that it generates very short
 374 |         |   // code as is it only reads a variable set at startup but a few bytes
 375 |         |   // doesn't matter here.
 376 |     504 | }

crc32_fast.c:is_arch_extension_supported
Line |   Count | Source
 340 |     252 | {
 341 |     252 |   int success = 1;
 342 |     252 |   uint32_t r[4]; // eax, ebx, ecx, edx
 343 |         |
 344 |         | #if defined(_MSC_VER)
 345 |         |   // This needs <intrin.h> with MSVC. ICC has it as a built-in
 346 |         |   // on all platforms.
 347 |         |   __cpuid(r, 1);
 348 |         | #elif defined(HAVE_CPUID_H)
 349 |         |   // Compared to just using __asm__ to run CPUID, this also checks
 350 |         |   // that CPUID is supported and saves and restores ebx as that is
 351 |         |   // needed with GCC < 5 with position-independent code (PIC).
 352 |     252 |   success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]);
 353 |         | #else
 354 |         |   // Just a fallback that shouldn't be needed.
 355 |         |   __asm__("cpuid\n\t"
 356 |         |       : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3])
 357 |         |       : "a"(1), "c"(0));
 358 |         | #endif
 359 |         |
 360 |         |   // Returns true if these are supported:
 361 |         |   // CLMUL (bit 1 in ecx)
 362 |         |   // SSSE3 (bit 9 in ecx)
 363 |         |   // SSE4.1 (bit 19 in ecx)
 364 |     252 |   const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19);
 365 |     252 |   return success && (r[2] & ecx_mask) == ecx_mask;
 366 |         |
 367 |         |   // Alternative methods that weren't used:
 368 |         |   //   - ICC's _may_i_use_cpu_feature: the other methods should work too.
 369 |         |   //   - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul")
 370 |         |   //
 371 |         |   // CPUID decoding is needed with MSVC anyway and older GCC. This keeps
 372 |         |   // the feature checks in the build system simpler too. The nice thing
 373 |         |   // about __builtin_cpu_supports would be that it generates very short
 374 |         |   // code as is it only reads a variable set at startup but a few bytes
 375 |         |   // doesn't matter here.
 376 |     252 | }

crc64_fast.c:is_arch_extension_supported
Line |   Count | Source
 340 |     252 | {
 341 |     252 |   int success = 1;
 342 |     252 |   uint32_t r[4]; // eax, ebx, ecx, edx
 343 |         |
 344 |         | #if defined(_MSC_VER)
 345 |         |   // This needs <intrin.h> with MSVC. ICC has it as a built-in
 346 |         |   // on all platforms.
 347 |         |   __cpuid(r, 1);
 348 |         | #elif defined(HAVE_CPUID_H)
 349 |         |   // Compared to just using __asm__ to run CPUID, this also checks
 350 |         |   // that CPUID is supported and saves and restores ebx as that is
 351 |         |   // needed with GCC < 5 with position-independent code (PIC).
 352 |     252 |   success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]);
 353 |         | #else
 354 |         |   // Just a fallback that shouldn't be needed.
 355 |         |   __asm__("cpuid\n\t"
 356 |         |       : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3])
 357 |         |       : "a"(1), "c"(0));
 358 |         | #endif
 359 |         |
 360 |         |   // Returns true if these are supported:
 361 |         |   // CLMUL (bit 1 in ecx)
 362 |         |   // SSSE3 (bit 9 in ecx)
 363 |         |   // SSE4.1 (bit 19 in ecx)
 364 |     252 |   const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19);
 365 |     252 |   return success && (r[2] & ecx_mask) == ecx_mask;
 366 |         |
 367 |         |   // Alternative methods that weren't used:
 368 |         |   //   - ICC's _may_i_use_cpu_feature: the other methods should work too.
 369 |         |   //   - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul")
 370 |         |   //
 371 |         |   // CPUID decoding is needed with MSVC anyway and older GCC. This keeps
 372 |         |   // the feature checks in the build system simpler too. The nice thing
 373 |         |   // about __builtin_cpu_supports would be that it generates very short
 374 |         |   // code as is it only reads a variable set at startup but a few bytes
 375 |         |   // doesn't matter here.
 376 |     252 | }

 377 |         | #endif
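
is_arch_extension_supported() is compiled only when a generic fallback exists, and the comment on lines 336-337 mentions crc32_resolve() and crc64_resolve(), which live outside this header. A sketch of the kind of function-pointer dispatch such a resolver performs; the resolver name and crc32_generic() are placeholders, not the actual liblzma code, and the declarations are assumed to be provided by the surrounding translation unit:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    typedef uint32_t (*crc32_func_type)(const uint8_t *buf, size_t size,
            uint32_t crc);

    // Assumed to exist in the translation unit: the table-based fallback and
    // the CLMUL implementation from this header.
    static uint32_t crc32_generic(const uint8_t *buf, size_t size,
            uint32_t crc);
    static uint32_t crc32_arch_optimized(const uint8_t *buf, size_t size,
            uint32_t crc);
    static bool is_arch_extension_supported(void);

    // Pick the implementation once, based on the CPUID check above.
    static crc32_func_type
    crc32_resolve_sketch(void)
    {
        return is_arch_extension_supported()
                ? &crc32_arch_optimized : &crc32_generic;
    }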