Coverage Report

Created: 2026-05-04 06:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/xz/src/liblzma/check/crc_x86_clmul.h
Line
Count
Source
1
// SPDX-License-Identifier: 0BSD
2
3
///////////////////////////////////////////////////////////////////////////////
4
//
5
/// \file       crc_x86_clmul.h
6
/// \brief      CRC32 and CRC64 implementations using CLMUL instructions.
7
///
8
/// The CRC32 and CRC64 implementations use 32/64-bit x86 SSSE3, SSE4.1, and
9
/// CLMUL instructions. This is compatible with Elbrus 2000 (E2K) too.
10
///
11
/// See the Intel white paper "Fast CRC Computation for Generic Polynomials
12
/// Using PCLMULQDQ Instruction" from 2009. The original file seems to be
13
/// gone from Intel's website but a version is available here:
14
/// https://www.researchgate.net/publication/263424619_Fast_CRC_computation
15
/// (The link was checked on 2024-06-11.)
16
///
17
/// While this file has both CRC32 and CRC64 implementations, only one
18
/// can be built at a time. The version to build is selected by defining
19
/// BUILDING_CRC_CLMUL to 32 or 64 before including this file.
20
///
21
/// NOTE: The x86 CLMUL CRC implementation was rewritten for XZ Utils 5.8.0.
22
//
23
//  Authors:    Lasse Collin
24
//              Ilya Kurdyukov
25
//
26
///////////////////////////////////////////////////////////////////////////////
27
28
// This file must not be included more than once.
29
#ifdef LZMA_CRC_X86_CLMUL_H
30
# error crc_x86_clmul.h was included twice.
31
#endif
32
#define LZMA_CRC_X86_CLMUL_H
33
34
#if BUILDING_CRC_CLMUL != 32 && BUILDING_CRC_CLMUL != 64
35
# error BUILDING_CRC_CLMUL is undefined or has an invalid value
36
#endif
37
38
#include <immintrin.h>
39
40
#if defined(_MSC_VER)
41
# include <intrin.h>
42
#elif defined(HAVE_CPUID_H)
43
# include <cpuid.h>
44
#endif
45
46
47
// EDG-based compilers (Intel's classic compiler and compiler for E2K) can
48
// define __GNUC__ but the attribute must not be used with them.
49
// The new Clang-based ICX needs the attribute.
50
//
51
// NOTE: Build systems check for this too, keep them in sync with this.
52
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
53
# define crc_attr_target \
54
    __attribute__((__target__("ssse3,sse4.1,pclmul")))
55
#else
56
# define crc_attr_target
57
#endif
58
59
60
// GCC and Clang would produce good code with _mm_set_epi64x
61
// but MSVC needs _mm_cvtsi64_si128 on x86-64.
62
#if defined(__i386__) || defined(_M_IX86)
63
# define my_set_low64(a) _mm_set_epi64x(0, (a))
64
#else
65
1.19M
# define my_set_low64(a) _mm_cvtsi64_si128(a)
66
#endif
67
68
69
// Align it so that the whole array is within the same cache line.
70
// More than one unaligned load can be done from this during the
71
// same CRC function call.
72
//
73
// The bytes [0] to [31] are used with AND to clear the low bytes. (With ANDN
74
// those could be used to clear the high bytes too but it's not needed here.)
75
//
76
// The bytes [16] to [47] are for left shifts.
77
// The bytes [32] to [63] are for right shifts.
78
alignas(64)
79
static uint8_t vmasks[64] = {
80
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
81
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
82
  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
83
  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
84
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
85
  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
86
  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
87
  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
88
};
89
90
91
// *Unaligned* 128-bit load
92
crc_attr_target
93
static inline __m128i
94
my_load128(const uint8_t *p)
95
543k
{
96
543k
  return _mm_loadu_si128((const __m128i *)p);
97
543k
}
crc32_fast.c:my_load128
Line
Count
Source
95
416k
{
96
416k
  return _mm_loadu_si128((const __m128i *)p);
97
416k
}
crc64_fast.c:my_load128
Line
Count
Source
95
126k
{
96
126k
  return _mm_loadu_si128((const __m128i *)p);
97
126k
}
98
99
100
// Keep the highest "count" bytes as is and clear the remaining low bytes.
101
crc_attr_target
102
static inline __m128i
103
keep_high_bytes(__m128i v, size_t count)
104
5.12k
{
105
5.12k
  return _mm_and_si128(my_load128((vmasks + count)), v);
106
5.12k
}
crc32_fast.c:keep_high_bytes
Line
Count
Source
104
1.23k
{
105
1.23k
  return _mm_and_si128(my_load128((vmasks + count)), v);
106
1.23k
}
crc64_fast.c:keep_high_bytes
Line
Count
Source
104
3.89k
{
105
3.89k
  return _mm_and_si128(my_load128((vmasks + count)), v);
106
3.89k
}
107
108
109
// Shift the 128-bit value left by "amount" bytes (not bits).
110
crc_attr_target
111
static inline __m128i
112
shift_left(__m128i v, size_t amount)
113
406k
{
114
406k
  return _mm_shuffle_epi8(v, my_load128((vmasks + 32 - amount)));
115
406k
}
crc32_fast.c:shift_left
Line
Count
Source
113
398k
{
114
398k
  return _mm_shuffle_epi8(v, my_load128((vmasks + 32 - amount)));
115
398k
}
crc64_fast.c:shift_left
Line
Count
Source
113
7.73k
{
114
7.73k
  return _mm_shuffle_epi8(v, my_load128((vmasks + 32 - amount)));
115
7.73k
}
116
117
118
// Shift the 128-bit value right by "amount" bytes (not bits).
119
crc_attr_target
120
static inline __m128i
121
shift_right(__m128i v, size_t amount)
122
5.12k
{
123
5.12k
  return _mm_shuffle_epi8(v, my_load128((vmasks + 32 + amount)));
124
5.12k
}
crc32_fast.c:shift_right
Line
Count
Source
122
1.23k
{
123
1.23k
  return _mm_shuffle_epi8(v, my_load128((vmasks + 32 + amount)));
124
1.23k
}
crc64_fast.c:shift_right
Line
Count
Source
122
3.89k
{
123
3.89k
  return _mm_shuffle_epi8(v, my_load128((vmasks + 32 + amount)));
124
3.89k
}
125
126
127
crc_attr_target
128
static inline __m128i
129
fold(__m128i v, __m128i k)
130
115k
{
131
115k
  __m128i a = _mm_clmulepi64_si128(v, k, 0x00);
132
115k
  __m128i b = _mm_clmulepi64_si128(v, k, 0x11);
133
115k
  return _mm_xor_si128(a, b);
134
115k
}
crc32_fast.c:fold
Line
Count
Source
130
8.67k
{
131
8.67k
  __m128i a = _mm_clmulepi64_si128(v, k, 0x00);
132
  __m128i b = _mm_clmulepi64_si128(v, k, 0x11);
133
8.67k
  return _mm_xor_si128(a, b);
134
8.67k
}
crc64_fast.c:fold
Line
Count
Source
130
106k
{
131
106k
  __m128i a = _mm_clmulepi64_si128(v, k, 0x00);
132
  __m128i b = _mm_clmulepi64_si128(v, k, 0x11);
133
106k
  return _mm_xor_si128(a, b);
134
106k
}
135
136
137
crc_attr_target
138
static inline __m128i
139
fold_xor(__m128i v, __m128i k, const uint8_t *buf)
140
102k
{
141
102k
  return _mm_xor_si128(my_load128(buf), fold(v, k));
142
102k
}
crc32_fast.c:fold_xor
Line
Count
Source
140
7.04k
{
141
7.04k
  return _mm_xor_si128(my_load128(buf), fold(v, k));
142
7.04k
}
crc64_fast.c:fold_xor
Line
Count
Source
140
95.0k
{
141
95.0k
  return _mm_xor_si128(my_load128(buf), fold(v, k));
142
95.0k
}
143
144
145
#if BUILDING_CRC_CLMUL == 32
146
crc_attr_target
147
static uint32_t
148
crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
149
#else
150
crc_attr_target
151
static uint64_t
152
crc64_arch_optimized(const uint8_t *buf, size_t size, uint64_t crc)
153
#endif
154
1.19M
{
155
  // We will assume that there is at least one byte of input.
156
1.19M
  if (size == 0)
157
129
    return crc;
158
159
  // See crc_clmul_consts_gen.c.
160
#if BUILDING_CRC_CLMUL == 32
161
1.18M
  const __m128i fold512 = _mm_set_epi64x(0x1d9513d7, 0x8f352d95);
162
1.18M
  const __m128i fold128 = _mm_set_epi64x(0xccaa009e, 0xae689191);
163
1.18M
  const __m128i mu_p = _mm_set_epi64x(
164
1.18M
    (int64_t)0xb4e5b025f7011641, 0x1db710640);
165
#else
166
8.49k
  const __m128i fold512 = _mm_set_epi64x(
167
8.49k
    (int64_t)0x081f6054a7842df4, (int64_t)0x6ae3efbb9dd441f3);
168
169
  const __m128i fold128 = _mm_set_epi64x(
170
    (int64_t)0xdabe95afc7875f40, (int64_t)0xe05dd497ca393ae4);
171
172
8.49k
  const __m128i mu_p = _mm_set_epi64x(
173
8.49k
    (int64_t)0x9c3e466c172963d5, (int64_t)0x92d8af2baf0e1e84);
174
8.49k
#endif
175
176
8.49k
  __m128i v0, v1, v2, v3;
177
178
8.49k
  crc = ~crc;
179
180
1.19M
  if (size < 8) {
181
393k
    uint64_t x = crc;
182
393k
    size_t i = 0;
183
184
    // Checking the bit instead of comparing the size means
185
    // that we don't need to update the size between the steps.
186
393k
    if (size & 4) {
187
252k
      x ^= read32le(buf);
188
252k
      buf += 4;
189
252k
      i = 32;
190
252k
    }
191
192
393k
    if (size & 2) {
193
270k
      x ^= (uint64_t)read16le(buf) << i;
194
270k
      buf += 2;
195
270k
      i += 16;
196
270k
    }
197
198
393k
    if (size & 1)
199
1.99k
      x ^= (uint64_t)*buf << i;
200
201
393k
    v0 = my_set_low64((int64_t)x);
202
393k
    v0 = shift_left(v0, 8 - size);
203
204
803k
  } else if (size < 16) {
205
792k
    v0 = my_set_low64((int64_t)(crc ^ read64le(buf)));
206
207
    // NOTE: buf is intentionally left 8 bytes behind so that
208
    // we can read the last 1-7 bytes with read64le(buf + size).
209
792k
    size -= 8;
210
211
    // Handling 8-byte input specially is a speed optimization
212
    // as the clmul can be skipped. A branch is also needed to
213
    // avoid a too high shift amount.
214
792k
    if (size > 0) {
215
8.02k
      const size_t padding = 8 - size;
216
8.02k
      uint64_t high = read64le(buf + size) >> (padding * 8);
217
218
#if defined(__i386__) || defined(_M_IX86)
219
      // Simple but likely not the best code for 32-bit x86.
220
      v0 = _mm_insert_epi32(v0, (int32_t)high, 2);
221
      v0 = _mm_insert_epi32(v0, (int32_t)(high >> 32), 3);
222
#else
223
8.02k
      v0 = _mm_insert_epi64(v0, (int64_t)high, 1);
224
8.02k
#endif
225
226
8.02k
      v0 = shift_left(v0, padding);
227
228
8.02k
      v1 = _mm_srli_si128(v0, 8);
229
8.02k
      v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
230
8.02k
      v0 = _mm_xor_si128(v0, v1);
231
8.02k
    }
232
792k
  } else {
233
10.5k
    v0 = my_set_low64((int64_t)crc);
234
235
    // To align or not to align the buf pointer? If the end of
236
    // the buffer isn't aligned, aligning the pointer here would
237
    // make us do an extra folding step with the associated byte
238
    // shuffling overhead. The cost of that would need to be
239
    // lower than the benefit of aligned reads. Testing on an old
240
    // Intel Ivy Bridge processor suggested that aligning isn't
241
    // worth the cost but it likely depends on the processor and
242
    // buffer size. Unaligned loads (MOVDQU) should be fast on
243
    // x86 processors that support PCLMULQDQ, so we don't align
244
    // the buf pointer here.
245
246
    // Read the first (and possibly the only) full 16 bytes.
247
10.5k
    v0 = _mm_xor_si128(v0, my_load128(buf));
248
10.5k
    buf += 16;
249
10.5k
    size -= 16;
250
251
10.5k
    if (size >= 48) {
252
2.81k
      v1 = my_load128(buf);
253
2.81k
      v2 = my_load128(buf + 16);
254
2.81k
      v3 = my_load128(buf + 32);
255
2.81k
      buf += 48;
256
2.81k
      size -= 48;
257
258
26.1k
      while (size >= 64) {
259
23.3k
        v0 = fold_xor(v0, fold512, buf);
260
23.3k
        v1 = fold_xor(v1, fold512, buf + 16);
261
23.3k
        v2 = fold_xor(v2, fold512, buf + 32);
262
23.3k
        v3 = fold_xor(v3, fold512, buf + 48);
263
23.3k
        buf += 64;
264
23.3k
        size -= 64;
265
23.3k
      }
266
267
2.81k
      v0 = _mm_xor_si128(v1, fold(v0, fold128));
268
2.81k
      v0 = _mm_xor_si128(v2, fold(v0, fold128));
269
2.81k
      v0 = _mm_xor_si128(v3, fold(v0, fold128));
270
2.81k
    }
271
272
19.4k
    while (size >= 16) {
273
8.85k
      v0 = fold_xor(v0, fold128, buf);
274
8.85k
      buf += 16;
275
8.85k
      size -= 16;
276
8.85k
    }
277
278
10.5k
    if (size > 0) {
279
      // We want the last "size" number of input bytes to
280
      // be at the high bits of v1. First do a full 16-byte
281
      // load and then mask the low bytes to zeros.
282
5.12k
      v1 = my_load128(buf + size - 16);
283
5.12k
      v1 = keep_high_bytes(v1, size);
284
285
      // Shift high bytes from v0 to the low bytes of v1.
286
      //
287
      // Alternatively we could replace the combination
288
      // keep_high_bytes + shift_right + _mm_or_si128 with
289
      // _mm_shuffle_epi8 + _mm_blendv_epi8 but that would
290
      // require larger tables for the masks. Now there are
291
      // three loads (instead of two) from the mask tables
292
      // but they all are from the same cache line.
293
5.12k
      v1 = _mm_or_si128(v1, shift_right(v0, size));
294
295
      // Shift high bytes of v0 away, padding the
296
      // low bytes with zeros.
297
5.12k
      v0 = shift_left(v0, 16 - size);
298
299
5.12k
      v0 = _mm_xor_si128(v1, fold(v0, fold128));
300
5.12k
    }
301
302
10.5k
    v1 = _mm_srli_si128(v0, 8);
303
10.5k
    v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
304
10.5k
    v0 = _mm_xor_si128(v0, v1);
305
10.5k
  }
306
307
  // Barrett reduction
308
309
#if BUILDING_CRC_CLMUL == 32
310
  v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
311
  v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
312
  v0 = _mm_xor_si128(v0, v1);
313
  return ~(uint32_t)_mm_extract_epi32(v0, 2);
314
#else
315
  // Because p is 65 bits but one bit doesn't fit into the 64-bit
316
  // half of __m128i, finish the second clmul by shifting v1 left
317
  // by 64 bits and xorring it to the final result.
318
  v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
319
  v2 = _mm_slli_si128(v1, 8);
320
  v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
321
  v0 = _mm_xor_si128(v0, v2);
322
  v0 = _mm_xor_si128(v0, v1);
323
#if defined(__i386__) || defined(_M_IX86)
324
  return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) |
325
      (uint64_t)(uint32_t)_mm_extract_epi32(v0, 2));
326
#else
327
  return ~(uint64_t)_mm_extract_epi64(v0, 1);
328
#endif
329
#endif
330
1.19M
}
crc32_fast.c:crc32_arch_optimized
Line
Count
Source
154
1.18M
{
155
  // We will assume that there is at least one byte of input.
156
1.18M
  if (size == 0)
157
129
    return crc;
158
159
  // See crc_clmul_consts_gen.c.
160
1.18M
#if BUILDING_CRC_CLMUL == 32
161
1.18M
  const __m128i fold512 = _mm_set_epi64x(0x1d9513d7, 0x8f352d95);
162
1.18M
  const __m128i fold128 = _mm_set_epi64x(0xccaa009e, 0xae689191);
163
1.18M
  const __m128i mu_p = _mm_set_epi64x(
164
1.18M
    (int64_t)0xb4e5b025f7011641, 0x1db710640);
165
#else
166
  const __m128i fold512 = _mm_set_epi64x(
167
    (int64_t)0x081f6054a7842df4, (int64_t)0x6ae3efbb9dd441f3);
168
169
  const __m128i fold128 = _mm_set_epi64x(
170
    (int64_t)0xdabe95afc7875f40, (int64_t)0xe05dd497ca393ae4);
171
172
  const __m128i mu_p = _mm_set_epi64x(
173
    (int64_t)0x9c3e466c172963d5, (int64_t)0x92d8af2baf0e1e84);
174
#endif
175
176
1.18M
  __m128i v0, v1, v2, v3;
177
178
1.18M
  crc = ~crc;
179
180
1.18M
  if (size < 8) {
181
390k
    uint64_t x = crc;
182
390k
    size_t i = 0;
183
184
    // Checking the bit instead of comparing the size means
185
    // that we don't need to update the size between the steps.
186
390k
    if (size & 4) {
187
250k
      x ^= read32le(buf);
188
250k
      buf += 4;
189
250k
      i = 32;
190
250k
    }
191
192
390k
    if (size & 2) {
193
268k
      x ^= (uint64_t)read16le(buf) << i;
194
268k
      buf += 2;
195
268k
      i += 16;
196
268k
    }
197
198
390k
    if (size & 1)
199
735
      x ^= (uint64_t)*buf << i;
200
201
390k
    v0 = my_set_low64((int64_t)x);
202
390k
    v0 = shift_left(v0, 8 - size);
203
204
797k
  } else if (size < 16) {
205
791k
    v0 = my_set_low64((int64_t)(crc ^ read64le(buf)));
206
207
    // NOTE: buf is intentionally left 8 bytes behind so that
208
    // we can read the last 1-7 bytes with read64le(buf + size).
209
791k
    size -= 8;
210
211
    // Handling 8-byte input specially is a speed optimization
212
    // as the clmul can be skipped. A branch is also needed to
213
    // avoid a too high shift amount.
214
791k
    if (size > 0) {
215
6.94k
      const size_t padding = 8 - size;
216
6.94k
      uint64_t high = read64le(buf + size) >> (padding * 8);
217
218
#if defined(__i386__) || defined(_M_IX86)
219
      // Simple but likely not the best code for 32-bit x86.
220
      v0 = _mm_insert_epi32(v0, (int32_t)high, 2);
221
      v0 = _mm_insert_epi32(v0, (int32_t)(high >> 32), 3);
222
#else
223
6.94k
      v0 = _mm_insert_epi64(v0, (int64_t)high, 1);
224
6.94k
#endif
225
226
6.94k
      v0 = shift_left(v0, padding);
227
228
6.94k
      v1 = _mm_srli_si128(v0, 8);
229
6.94k
      v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
230
6.94k
      v0 = _mm_xor_si128(v0, v1);
231
6.94k
    }
232
791k
  } else {
233
6.26k
    v0 = my_set_low64((int64_t)crc);
234
235
    // To align or not to align the buf pointer? If the end of
236
    // the buffer isn't aligned, aligning the pointer here would
237
    // make us do an extra folding step with the associated byte
238
    // shuffling overhead. The cost of that would need to be
239
    // lower than the benefit of aligned reads. Testing on an old
240
    // Intel Ivy Bridge processor suggested that aligning isn't
241
    // worth the cost but it likely depends on the processor and
242
    // buffer size. Unaligned loads (MOVDQU) should be fast on
243
    // x86 processors that support PCLMULQDQ, so we don't align
244
    // the buf pointer here.
245
246
    // Read the first (and possibly the only) full 16 bytes.
247
6.26k
    v0 = _mm_xor_si128(v0, my_load128(buf));
248
6.26k
    buf += 16;
249
6.26k
    size -= 16;
250
251
6.26k
    if (size >= 48) {
252
133
      v1 = my_load128(buf);
253
133
      v2 = my_load128(buf + 16);
254
133
      v3 = my_load128(buf + 32);
255
133
      buf += 48;
256
133
      size -= 48;
257
258
876
      while (size >= 64) {
259
743
        v0 = fold_xor(v0, fold512, buf);
260
743
        v1 = fold_xor(v1, fold512, buf + 16);
261
743
        v2 = fold_xor(v2, fold512, buf + 32);
262
743
        v3 = fold_xor(v3, fold512, buf + 48);
263
743
        buf += 64;
264
743
        size -= 64;
265
743
      }
266
267
133
      v0 = _mm_xor_si128(v1, fold(v0, fold128));
268
133
      v0 = _mm_xor_si128(v2, fold(v0, fold128));
269
133
      v0 = _mm_xor_si128(v3, fold(v0, fold128));
270
133
    }
271
272
10.3k
    while (size >= 16) {
273
4.06k
      v0 = fold_xor(v0, fold128, buf);
274
4.06k
      buf += 16;
275
4.06k
      size -= 16;
276
4.06k
    }
277
278
6.26k
    if (size > 0) {
279
      // We want the last "size" number of input bytes to
280
      // be at the high bits of v1. First do a full 16-byte
281
      // load and then mask the low bytes to zeros.
282
1.23k
      v1 = my_load128(buf + size - 16);
283
1.23k
      v1 = keep_high_bytes(v1, size);
284
285
      // Shift high bytes from v0 to the low bytes of v1.
286
      //
287
      // Alternatively we could replace the combination
288
      // keep_high_bytes + shift_right + _mm_or_si128 with
289
      // _mm_shuffle_epi8 + _mm_blendv_epi8 but that would
290
      // require larger tables for the masks. Now there are
291
      // three loads (instead of two) from the mask tables
292
      // but they all are from the same cache line.
293
1.23k
      v1 = _mm_or_si128(v1, shift_right(v0, size));
294
295
      // Shift high bytes of v0 away, padding the
296
      // low bytes with zeros.
297
1.23k
      v0 = shift_left(v0, 16 - size);
298
299
1.23k
      v0 = _mm_xor_si128(v1, fold(v0, fold128));
300
1.23k
    }
301
302
6.26k
    v1 = _mm_srli_si128(v0, 8);
303
6.26k
    v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
304
6.26k
    v0 = _mm_xor_si128(v0, v1);
305
6.26k
  }
306
307
  // Barrett reduction
308
309
1.18M
#if BUILDING_CRC_CLMUL == 32
310
1.18M
  v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
311
1.18M
  v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
312
1.18M
  v0 = _mm_xor_si128(v0, v1);
313
1.18M
  return ~(uint32_t)_mm_extract_epi32(v0, 2);
314
#else
315
  // Because p is 65 bits but one bit doesn't fit into the 64-bit
316
  // half of __m128i, finish the second clmul by shifting v1 left
317
  // by 64 bits and xorring it to the final result.
318
  v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
319
  v2 = _mm_slli_si128(v1, 8);
320
  v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
321
  v0 = _mm_xor_si128(v0, v2);
322
  v0 = _mm_xor_si128(v0, v1);
323
#if defined(__i386__) || defined(_M_IX86)
324
  return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) |
325
      (uint64_t)(uint32_t)_mm_extract_epi32(v0, 2));
326
#else
327
  return ~(uint64_t)_mm_extract_epi64(v0, 1);
328
#endif
329
#endif
330
1.18M
}
crc64_fast.c:crc64_arch_optimized
Line
Count
Source
154
8.49k
{
155
  // We will assume that there is at least one byte of input.
156
8.49k
  if (size == 0)
157
0
    return crc;
158
159
  // See crc_clmul_consts_gen.c.
160
#if BUILDING_CRC_CLMUL == 32
161
  const __m128i fold512 = _mm_set_epi64x(0x1d9513d7, 0x8f352d95);
162
  const __m128i fold128 = _mm_set_epi64x(0xccaa009e, 0xae689191);
163
  const __m128i mu_p = _mm_set_epi64x(
164
    (int64_t)0xb4e5b025f7011641, 0x1db710640);
165
#else
166
8.49k
  const __m128i fold512 = _mm_set_epi64x(
167
8.49k
    (int64_t)0x081f6054a7842df4, (int64_t)0x6ae3efbb9dd441f3);
168
169
8.49k
  const __m128i fold128 = _mm_set_epi64x(
170
8.49k
    (int64_t)0xdabe95afc7875f40, (int64_t)0xe05dd497ca393ae4);
171
172
8.49k
  const __m128i mu_p = _mm_set_epi64x(
173
8.49k
    (int64_t)0x9c3e466c172963d5, (int64_t)0x92d8af2baf0e1e84);
174
8.49k
#endif
175
176
8.49k
  __m128i v0, v1, v2, v3;
177
178
8.49k
  crc = ~crc;
179
180
8.49k
  if (size < 8) {
181
2.76k
    uint64_t x = crc;
182
2.76k
    size_t i = 0;
183
184
    // Checking the bit instead of comparing the size means
185
    // that we don't need to update the size between the steps.
186
2.76k
    if (size & 4) {
187
1.64k
      x ^= read32le(buf);
188
1.64k
      buf += 4;
189
1.64k
      i = 32;
190
1.64k
    }
191
192
2.76k
    if (size & 2) {
193
1.52k
      x ^= (uint64_t)read16le(buf) << i;
194
1.52k
      buf += 2;
195
1.52k
      i += 16;
196
1.52k
    }
197
198
2.76k
    if (size & 1)
199
1.26k
      x ^= (uint64_t)*buf << i;
200
201
2.76k
    v0 = my_set_low64((int64_t)x);
202
2.76k
    v0 = shift_left(v0, 8 - size);
203
204
5.73k
  } else if (size < 16) {
205
1.43k
    v0 = my_set_low64((int64_t)(crc ^ read64le(buf)));
206
207
    // NOTE: buf is intentionally left 8 bytes behind so that
208
    // we can read the last 1-7 bytes with read64le(buf + size).
209
1.43k
    size -= 8;
210
211
    // Handling 8-byte input specially is a speed optimization
212
    // as the clmul can be skipped. A branch is also needed to
213
    // avoid a too high shift amount.
214
1.43k
    if (size > 0) {
215
1.08k
      const size_t padding = 8 - size;
216
1.08k
      uint64_t high = read64le(buf + size) >> (padding * 8);
217
218
#if defined(__i386__) || defined(_M_IX86)
219
      // Simple but likely not the best code for 32-bit x86.
220
      v0 = _mm_insert_epi32(v0, (int32_t)high, 2);
221
      v0 = _mm_insert_epi32(v0, (int32_t)(high >> 32), 3);
222
#else
223
1.08k
      v0 = _mm_insert_epi64(v0, (int64_t)high, 1);
224
1.08k
#endif
225
226
1.08k
      v0 = shift_left(v0, padding);
227
228
1.08k
      v1 = _mm_srli_si128(v0, 8);
229
1.08k
      v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
230
1.08k
      v0 = _mm_xor_si128(v0, v1);
231
1.08k
    }
232
4.29k
  } else {
233
4.29k
    v0 = my_set_low64((int64_t)crc);
234
235
    // To align or not to align the buf pointer? If the end of
236
    // the buffer isn't aligned, aligning the pointer here would
237
    // make us do an extra folding step with the associated byte
238
    // shuffling overhead. The cost of that would need to be
239
    // lower than the benefit of aligned reads. Testing on an old
240
    // Intel Ivy Bridge processor suggested that aligning isn't
241
    // worth the cost but it likely depends on the processor and
242
    // buffer size. Unaligned loads (MOVDQU) should be fast on
243
    // x86 processors that support PCLMULQDQ, so we don't align
244
    // the buf pointer here.
245
246
    // Read the first (and possibly the only) full 16 bytes.
247
4.29k
    v0 = _mm_xor_si128(v0, my_load128(buf));
248
4.29k
    buf += 16;
249
4.29k
    size -= 16;
250
251
4.29k
    if (size >= 48) {
252
2.68k
      v1 = my_load128(buf);
253
2.68k
      v2 = my_load128(buf + 16);
254
2.68k
      v3 = my_load128(buf + 32);
255
2.68k
      buf += 48;
256
2.68k
      size -= 48;
257
258
25.2k
      while (size >= 64) {
259
22.5k
        v0 = fold_xor(v0, fold512, buf);
260
22.5k
        v1 = fold_xor(v1, fold512, buf + 16);
261
22.5k
        v2 = fold_xor(v2, fold512, buf + 32);
262
22.5k
        v3 = fold_xor(v3, fold512, buf + 48);
263
22.5k
        buf += 64;
264
22.5k
        size -= 64;
265
22.5k
      }
266
267
2.68k
      v0 = _mm_xor_si128(v1, fold(v0, fold128));
268
2.68k
      v0 = _mm_xor_si128(v2, fold(v0, fold128));
269
2.68k
      v0 = _mm_xor_si128(v3, fold(v0, fold128));
270
2.68k
    }
271
272
9.08k
    while (size >= 16) {
273
4.78k
      v0 = fold_xor(v0, fold128, buf);
274
4.78k
      buf += 16;
275
4.78k
      size -= 16;
276
4.78k
    }
277
278
4.29k
    if (size > 0) {
279
      // We want the last "size" number of input bytes to
280
      // be at the high bits of v1. First do a full 16-byte
281
      // load and then mask the low bytes to zeros.
282
3.89k
      v1 = my_load128(buf + size - 16);
283
3.89k
      v1 = keep_high_bytes(v1, size);
284
285
      // Shift high bytes from v0 to the low bytes of v1.
286
      //
287
      // Alternatively we could replace the combination
288
      // keep_high_bytes + shift_right + _mm_or_si128 with
289
      // _mm_shuffle_epi8 + _mm_blendv_epi8 but that would
290
      // require larger tables for the masks. Now there are
291
      // three loads (instead of two) from the mask tables
292
      // but they all are from the same cache line.
293
3.89k
      v1 = _mm_or_si128(v1, shift_right(v0, size));
294
295
      // Shift high bytes of v0 away, padding the
296
      // low bytes with zeros.
297
3.89k
      v0 = shift_left(v0, 16 - size);
298
299
3.89k
      v0 = _mm_xor_si128(v1, fold(v0, fold128));
300
3.89k
    }
301
302
4.29k
    v1 = _mm_srli_si128(v0, 8);
303
4.29k
    v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
304
4.29k
    v0 = _mm_xor_si128(v0, v1);
305
4.29k
  }
306
307
  // Barrett reduction
308
309
#if BUILDING_CRC_CLMUL == 32
310
  v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
311
  v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
312
  v0 = _mm_xor_si128(v0, v1);
313
  return ~(uint32_t)_mm_extract_epi32(v0, 2);
314
#else
315
  // Because p is 65 bits but one bit doesn't fit into the 64-bit
316
  // half of __m128i, finish the second clmul by shifting v1 left
317
  // by 64 bits and xorring it to the final result.
318
8.49k
  v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
319
8.49k
  v2 = _mm_slli_si128(v1, 8);
320
8.49k
  v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
321
8.49k
  v0 = _mm_xor_si128(v0, v2);
322
8.49k
  v0 = _mm_xor_si128(v0, v1);
323
#if defined(__i386__) || defined(_M_IX86)
324
  return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) |
325
      (uint64_t)(uint32_t)_mm_extract_epi32(v0, 2));
326
#else
327
  return ~(uint64_t)_mm_extract_epi64(v0, 1);
328
8.49k
#endif
329
8.49k
#endif
330
8.49k
}
331
332
333
// Even though this is an inline function, compile it only when needed.
334
// This way it won't appear in E2K builds at all.
335
#if defined(CRC32_GENERIC) || defined(CRC64_GENERIC)
336
// Inlining this function duplicates the function body in crc32_resolve() and
337
// crc64_resolve(), but this is acceptable because this is a tiny function.
338
static inline bool
339
is_arch_extension_supported(void)
340
12
{
341
12
  int success = 1;
342
12
  uint32_t r[4]; // eax, ebx, ecx, edx
343
344
#if defined(_MSC_VER)
345
  // This needs <intrin.h> with MSVC. ICC has it as a built-in
346
  // on all platforms.
347
  __cpuid((int *)r, 1);
348
#elif defined(HAVE_CPUID_H)
349
  // Compared to just using __asm__ to run CPUID, this also checks
350
  // that CPUID is supported and saves and restores ebx as that is
351
  // needed with GCC < 5 with position-independent code (PIC).
352
12
  success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]);
353
#else
354
  // Just a fallback that shouldn't be needed.
355
  __asm__("cpuid\n\t"
356
      : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3])
357
      : "a"(1), "c"(0));
358
#endif
359
360
  // Returns true if these are supported:
361
  // CLMUL (bit 1 in ecx)
362
  // SSSE3 (bit 9 in ecx)
363
  // SSE4.1 (bit 19 in ecx)
364
12
  const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19);
365
12
  return success && (r[2] & ecx_mask) == ecx_mask;
366
367
  // Alternative methods that weren't used:
368
  //   - ICC's _may_i_use_cpu_feature: the other methods should work too.
369
  //   - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul")
370
  //
371
  // CPUID decoding is needed with MSVC anyway and older GCC. This keeps
372
  // the feature checks in the build system simpler too. The nice thing
373
  // about __builtin_cpu_supports would be that it generates very short
374
  // code as is it only reads a variable set at startup but a few bytes
375
  // doesn't matter here.
376
12
}
crc32_fast.c:is_arch_extension_supported
Line
Count
Source
340
6
{
341
6
  int success = 1;
342
6
  uint32_t r[4]; // eax, ebx, ecx, edx
343
344
#if defined(_MSC_VER)
345
  // This needs <intrin.h> with MSVC. ICC has it as a built-in
346
  // on all platforms.
347
  __cpuid((int *)r, 1);
348
#elif defined(HAVE_CPUID_H)
349
  // Compared to just using __asm__ to run CPUID, this also checks
350
  // that CPUID is supported and saves and restores ebx as that is
351
  // needed with GCC < 5 with position-independent code (PIC).
352
6
  success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]);
353
#else
354
  // Just a fallback that shouldn't be needed.
355
  __asm__("cpuid\n\t"
356
      : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3])
357
      : "a"(1), "c"(0));
358
#endif
359
360
  // Returns true if these are supported:
361
  // CLMUL (bit 1 in ecx)
362
  // SSSE3 (bit 9 in ecx)
363
  // SSE4.1 (bit 19 in ecx)
364
6
  const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19);
365
6
  return success && (r[2] & ecx_mask) == ecx_mask;
366
367
  // Alternative methods that weren't used:
368
  //   - ICC's _may_i_use_cpu_feature: the other methods should work too.
369
  //   - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul")
370
  //
371
  // CPUID decoding is needed with MSVC anyway and older GCC. This keeps
372
  // the feature checks in the build system simpler too. The nice thing
373
  // about __builtin_cpu_supports would be that it generates very short
374
  // code as is it only reads a variable set at startup but a few bytes
375
  // doesn't matter here.
376
6
}
crc64_fast.c:is_arch_extension_supported
Line
Count
Source
340
6
{
341
6
  int success = 1;
342
6
  uint32_t r[4]; // eax, ebx, ecx, edx
343
344
#if defined(_MSC_VER)
345
  // This needs <intrin.h> with MSVC. ICC has it as a built-in
346
  // on all platforms.
347
  __cpuid((int *)r, 1);
348
#elif defined(HAVE_CPUID_H)
349
  // Compared to just using __asm__ to run CPUID, this also checks
350
  // that CPUID is supported and saves and restores ebx as that is
351
  // needed with GCC < 5 with position-independent code (PIC).
352
6
  success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]);
353
#else
354
  // Just a fallback that shouldn't be needed.
355
  __asm__("cpuid\n\t"
356
      : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3])
357
      : "a"(1), "c"(0));
358
#endif
359
360
  // Returns true if these are supported:
361
  // CLMUL (bit 1 in ecx)
362
  // SSSE3 (bit 9 in ecx)
363
  // SSE4.1 (bit 19 in ecx)
364
6
  const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19);
365
6
  return success && (r[2] & ecx_mask) == ecx_mask;
366
367
  // Alternative methods that weren't used:
368
  //   - ICC's _may_i_use_cpu_feature: the other methods should work too.
369
  //   - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul")
370
  //
371
  // CPUID decoding is needed with MSVC anyway and older GCC. This keeps
372
  // the feature checks in the build system simpler too. The nice thing
373
  // about __builtin_cpu_supports would be that it generates very short
374
  // code as is it only reads a variable set at startup but a few bytes
375
  // doesn't matter here.
376
6
}
377
#endif