Coverage Report

Created: 2026-02-14 07:11

/src/xz/src/liblzma/check/crc_x86_clmul.h
Line
Count
Source
1
// SPDX-License-Identifier: 0BSD
2
3
///////////////////////////////////////////////////////////////////////////////
4
//
5
/// \file       crc_x86_clmul.h
6
/// \brief      CRC32 and CRC64 implementations using CLMUL instructions.
7
///
8
/// The CRC32 and CRC64 implementations use 32/64-bit x86 SSSE3, SSE4.1, and
9
/// CLMUL instructions. This is compatible with Elbrus 2000 (E2K) too.
10
///
11
/// See the Intel white paper "Fast CRC Computation for Generic Polynomials
12
/// Using PCLMULQDQ Instruction" from 2009. The original file seems to be
13
/// gone from Intel's website but a version is available here:
14
/// https://www.researchgate.net/publication/263424619_Fast_CRC_computation
15
/// (The link was checked on 2024-06-11.)
16
///
17
/// While this file has both CRC32 and CRC64 implementations, only one
18
/// can be built at a time. The version to build is selected by defining
19
/// BUILDING_CRC_CLMUL to 32 or 64 before including this file.
20
///
21
/// NOTE: The x86 CLMUL CRC implementation was rewritten for XZ Utils 5.8.0.
22
//
23
//  Authors:    Lasse Collin
24
//              Ilya Kurdyukov
25
//
26
///////////////////////////////////////////////////////////////////////////////
27
28
// This file must not be included more than once.
29
#ifdef LZMA_CRC_X86_CLMUL_H
30
# error crc_x86_clmul.h was included twice.
31
#endif
32
#define LZMA_CRC_X86_CLMUL_H
33
34
#if BUILDING_CRC_CLMUL != 32 && BUILDING_CRC_CLMUL != 64
35
# error BUILDING_CRC_CLMUL is undefined or has an invalid value
36
#endif
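
As the header comment above notes, this file is compiled once per CRC width by a wrapping translation unit; the per-instantiation tables further below come from the two such units, crc32_fast.c and crc64_fast.c. A minimal sketch of such a wrapper (illustration only; the real files also contain the generic implementations and the runtime dispatch):

// Hypothetical wrapper in the spirit of crc32_fast.c (sketch only).
#define BUILDING_CRC_CLMUL 32
#include "crc_x86_clmul.h"   // provides crc32_arch_optimized()
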
37
38
#include <immintrin.h>
39
40
#if defined(_MSC_VER)
41
# include <intrin.h>
42
#elif defined(HAVE_CPUID_H)
43
# include <cpuid.h>
44
#endif
45
46
47
// EDG-based compilers (Intel's classic compiler and compiler for E2K) can
48
// define __GNUC__ but the attribute must not be used with them.
49
// The new Clang-based ICX needs the attribute.
50
//
51
// NOTE: Build systems check for this too; keep them in sync with this.
52
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
53
# define crc_attr_target \
54
    __attribute__((__target__("ssse3,sse4.1,pclmul")))
55
#else
56
# define crc_attr_target
57
#endif
58
59
60
// GCC and Clang would produce good code with _mm_set_epi64x
61
// but MSVC needs _mm_cvtsi64_si128 on x86-64.
62
#if defined(__i386__) || defined(_M_IX86)
63
# define my_set_low64(a) _mm_set_epi64x(0, (a))
64
#else
65
45.7k
# define my_set_low64(a) _mm_cvtsi64_si128(a)
66
#endif
67
68
69
// Align it so that the whole array is within the same cache line.
70
// More than one unaligned load can be done from this during the
71
// same CRC function call.
72
//
73
// The bytes [0] to [31] are used with AND to clear the low bytes. (With ANDN
74
// those could be used to clear the high bytes too but it's not needed here.)
75
//
76
// The bytes [16] to [47] are for left shifts.
77
// The bytes [32] to [63] are for right shifts.
78
alignas(64)
79
static uint8_t vmasks[64] = {
80
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
81
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
82
  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
83
  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
84
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
85
  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
86
  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
87
  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
88
};
89
90
91
// *Unaligned* 128-bit load
92
crc_attr_target
93
static inline __m128i
94
my_load128(const uint8_t *p)
95
562k
{
96
562k
  return _mm_loadu_si128((const __m128i *)p);
97
562k
}
crc32_fast.c:my_load128
Line
Count
Source
95
147k
{
96
147k
  return _mm_loadu_si128((const __m128i *)p);
97
147k
}
crc64_fast.c:my_load128
Line
Count
Source
95
414k
{
96
414k
  return _mm_loadu_si128((const __m128i *)p);
97
414k
}
98
99
100
// Keep the highest "count" bytes as is and clear the remaining low bytes.
101
crc_attr_target
102
static inline __m128i
103
keep_high_bytes(__m128i v, size_t count)
104
11.1k
{
105
11.1k
  return _mm_and_si128(my_load128((vmasks + count)), v);
106
11.1k
}
crc32_fast.c:keep_high_bytes
Line
Count
Source
104
8.01k
{
105
8.01k
  return _mm_and_si128(my_load128((vmasks + count)), v);
106
8.01k
}
crc64_fast.c:keep_high_bytes
Line
Count
Source
104
3.09k
{
105
3.09k
  return _mm_and_si128(my_load128((vmasks + count)), v);
106
3.09k
}
107
108
109
// Shift the 128-bit value left by "amount" bytes (not bits).
110
crc_attr_target
111
static inline __m128i
112
shift_left(__m128i v, size_t amount)
113
42.3k
{
114
42.3k
  return _mm_shuffle_epi8(v, my_load128((vmasks + 32 - amount)));
115
42.3k
}
crc32_fast.c:shift_left
Line
Count
Source
113
27.3k
{
114
27.3k
  return _mm_shuffle_epi8(v, my_load128((vmasks + 32 - amount)));
115
27.3k
}
crc64_fast.c:shift_left
Line
Count
Source
113
15.0k
{
114
15.0k
  return _mm_shuffle_epi8(v, my_load128((vmasks + 32 - amount)));
115
15.0k
}
116
117
118
// Shift the 128-bit value right by "amount" bytes (not bits).
119
crc_attr_target
120
static inline __m128i
121
shift_right(__m128i v, size_t amount)
122
11.1k
{
123
11.1k
  return _mm_shuffle_epi8(v, my_load128((vmasks + 32 + amount)));
124
11.1k
}
crc32_fast.c:shift_right
Line
Count
Source
122
8.01k
{
123
8.01k
  return _mm_shuffle_epi8(v, my_load128((vmasks + 32 + amount)));
124
8.01k
}
crc64_fast.c:shift_right
Line
Count
Source
122
3.09k
{
123
3.09k
  return _mm_shuffle_epi8(v, my_load128((vmasks + 32 + amount)));
124
3.09k
}
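
A scalar model of the PSHUFB-based helpers above may help when reading the vmasks table: a control byte with its high bit set (0xFF in the table) zeroes the corresponding result byte, and otherwise its low four bits select a source byte. The following sketch (illustration only, not part of the file being measured) mirrors shift_left(), shift_right(), and keep_high_bytes(); byte 0 is the least significant byte of the 128-bit value.

#include <stddef.h>
#include <stdint.h>

// Equivalent of shift_left(): control bytes come from vmasks + 32 - amount,
// so the low "amount" result bytes are zeroed and the rest are v[i - amount].
static void
model_shift_left(uint8_t out[16], const uint8_t v[16], size_t amount)
{
  for (size_t i = 0; i < 16; ++i)
    out[i] = i < amount ? 0 : v[i - amount];
}

// Equivalent of shift_right(): control bytes come from vmasks + 32 + amount,
// selecting v[i + amount] while it exists and zeroing the high bytes.
static void
model_shift_right(uint8_t out[16], const uint8_t v[16], size_t amount)
{
  for (size_t i = 0; i < 16; ++i)
    out[i] = i + amount < 16 ? v[i + amount] : 0;
}

// Equivalent of keep_high_bytes(): the AND mask from vmasks + count clears
// the low 16 - count bytes and keeps the high "count" bytes.
static void
model_keep_high_bytes(uint8_t out[16], const uint8_t v[16], size_t count)
{
  for (size_t i = 0; i < 16; ++i)
    out[i] = i < 16 - count ? 0 : v[i];
}
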
125
126
127
crc_attr_target
128
static inline __m128i
129
fold(__m128i v, __m128i k)
130
485k
{
131
485k
  __m128i a = _mm_clmulepi64_si128(v, k, 0x00);
132
485k
  __m128i b = _mm_clmulepi64_si128(v, k, 0x11);
133
485k
  return _mm_xor_si128(a, b);
134
485k
}
crc32_fast.c:fold
Line
Count
Source
130
95.8k
{
131
95.8k
  __m128i a = _mm_clmulepi64_si128(v, k, 0x00);
132
  __m128i b = _mm_clmulepi64_si128(v, k, 0x11);
133
95.8k
  return _mm_xor_si128(a, b);
134
95.8k
}
crc64_fast.c:fold
Line
Count
Source
130
389k
{
131
389k
  __m128i a = _mm_clmulepi64_si128(v, k, 0x00);
132
  __m128i b = _mm_clmulepi64_si128(v, k, 0x11);
133
389k
  return _mm_xor_si128(a, b);
134
389k
}
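
The folding step above relies on carry-less multiplication: _mm_clmulepi64_si128 with immediate 0x00 multiplies the low 64-bit halves of its two operands and 0x11 multiplies the high halves, each giving a 128-bit product, and fold() XORs the two products. A scalar model of one such multiplication (GF(2) polynomial multiplication; a sketch for illustration only):

#include <stdint.h>

typedef struct { uint64_t lo, hi; } clmul_u128;

// 64x64 -> 128-bit carry-less multiplication: like schoolbook multiplication
// but partial products are combined with XOR instead of addition.
static clmul_u128
clmul64(uint64_t a, uint64_t b)
{
  clmul_u128 r = { 0, 0 };
  for (unsigned i = 0; i < 64; ++i) {
    if ((b >> i) & 1) {
      r.lo ^= a << i;
      if (i != 0)
        r.hi ^= a >> (64 - i);
    }
  }
  return r;
}

Per the Intel white paper cited in the file header, the fold constants are precomputed remainders of powers of x modulo the CRC polynomial (see crc_clmul_consts_gen.c): fold512 is applied to each of the four 16-byte lanes in the 64-byte main loop further below, and fold128 folds one 16-byte block into the next.
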
135
136
137
crc_attr_target
138
static inline __m128i
139
fold_xor(__m128i v, __m128i k, const uint8_t *buf)
140
446k
{
141
446k
  return _mm_xor_si128(my_load128(buf), fold(v, k));
142
446k
}
crc32_fast.c:fold_xor
Line
Count
Source
140
67.1k
{
141
67.1k
  return _mm_xor_si128(my_load128(buf), fold(v, k));
142
67.1k
}
crc64_fast.c:fold_xor
Line
Count
Source
140
379k
{
141
379k
  return _mm_xor_si128(my_load128(buf), fold(v, k));
142
379k
}
143
144
145
#if BUILDING_CRC_CLMUL == 32
146
crc_attr_target
147
static uint32_t
148
crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
149
#else
150
crc_attr_target
151
static uint64_t
152
crc64_arch_optimized(const uint8_t *buf, size_t size, uint64_t crc)
153
#endif
154
45.7k
{
155
  // We will assume that there is at least one byte of input.
156
45.7k
  if (size == 0)
157
4
    return crc;
158
159
  // See crc_clmul_consts_gen.c.
160
#if BUILDING_CRC_CLMUL == 32
161
29.9k
  const __m128i fold512 = _mm_set_epi64x(0x1d9513d7, 0x8f352d95);
162
29.9k
  const __m128i fold128 = _mm_set_epi64x(0xccaa009e, 0xae689191);
163
29.9k
  const __m128i mu_p = _mm_set_epi64x(
164
29.9k
    (int64_t)0xb4e5b025f7011641, 0x1db710640);
165
#else
166
15.7k
  const __m128i fold512 = _mm_set_epi64x(
167
15.7k
    (int64_t)0x081f6054a7842df4, (int64_t)0x6ae3efbb9dd441f3);
168
169
  const __m128i fold128 = _mm_set_epi64x(
170
    (int64_t)0xdabe95afc7875f40, (int64_t)0xe05dd497ca393ae4);
171
172
15.7k
  const __m128i mu_p = _mm_set_epi64x(
173
15.7k
    (int64_t)0x9c3e466c172963d5, (int64_t)0x92d8af2baf0e1e84);
174
15.7k
#endif
175
176
15.7k
  __m128i v0, v1, v2, v3;
177
178
15.7k
  crc = ~crc;
179
180
45.7k
  if (size < 8) {
181
12.9k
    uint64_t x = crc;
182
12.9k
    size_t i = 0;
183
184
    // Checking the bit instead of comparing the size means
185
    // that we don't need to update the size between the steps.
186
12.9k
    if (size & 4) {
187
4.11k
      x ^= read32le(buf);
188
4.11k
      buf += 4;
189
4.11k
      i = 32;
190
4.11k
    }
191
192
12.9k
    if (size & 2) {
193
11.9k
      x ^= (uint64_t)read16le(buf) << i;
194
11.9k
      buf += 2;
195
11.9k
      i += 16;
196
11.9k
    }
197
198
12.9k
    if (size & 1)
199
4.73k
      x ^= (uint64_t)*buf << i;
200
201
12.9k
    v0 = my_set_low64((int64_t)x);
202
12.9k
    v0 = shift_left(v0, 8 - size);
203
204
32.7k
  } else if (size < 16) {
205
20.4k
    v0 = my_set_low64((int64_t)(crc ^ read64le(buf)));
206
207
    // NOTE: buf is intentionally left 8 bytes behind so that
208
    // we can read the last 1-7 bytes with read64le(buf + size).
209
20.4k
    size -= 8;
210
211
    // Handling 8-byte input specially is a speed optimization
212
    // as the clmul can be skipped. A branch is also needed to
213
    // avoid a too high shift amount.
214
20.4k
    if (size > 0) {
215
18.2k
      const size_t padding = 8 - size;
216
18.2k
      uint64_t high = read64le(buf + size) >> (padding * 8);
217
218
#if defined(__i386__) || defined(_M_IX86)
219
      // Simple but likely not the best code for 32-bit x86.
220
      v0 = _mm_insert_epi32(v0, (int32_t)high, 2);
221
      v0 = _mm_insert_epi32(v0, (int32_t)(high >> 32), 3);
222
#else
223
18.2k
      v0 = _mm_insert_epi64(v0, (int64_t)high, 1);
224
18.2k
#endif
225
226
18.2k
      v0 = shift_left(v0, padding);
227
228
18.2k
      v1 = _mm_srli_si128(v0, 8);
229
18.2k
      v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
230
18.2k
      v0 = _mm_xor_si128(v0, v1);
231
18.2k
    }
232
20.4k
  } else {
233
12.2k
    v0 = my_set_low64((int64_t)crc);
234
235
    // To align or not to align the buf pointer? If the end of
236
    // the buffer isn't aligned, aligning the pointer here would
237
    // make us do an extra folding step with the associated byte
238
    // shuffling overhead. The cost of that would need to be
239
    // lower than the benefit of aligned reads. Testing on an old
240
    // Intel Ivy Bridge processor suggested that aligning isn't
241
    // worth the cost but it likely depends on the processor and
242
    // buffer size. Unaligned loads (MOVDQU) should be fast on
243
    // x86 processors that support PCLMULQDQ, so we don't align
244
    // the buf pointer here.
245
246
    // Read the first (and possibly the only) full 16 bytes.
247
12.2k
    v0 = _mm_xor_si128(v0, my_load128(buf));
248
12.2k
    buf += 16;
249
12.2k
    size -= 16;
250
251
12.2k
    if (size >= 48) {
252
9.41k
      v1 = my_load128(buf);
253
9.41k
      v2 = my_load128(buf + 16);
254
9.41k
      v3 = my_load128(buf + 32);
255
9.41k
      buf += 48;
256
9.41k
      size -= 48;
257
258
118k
      while (size >= 64) {
259
109k
        v0 = fold_xor(v0, fold512, buf);
260
109k
        v1 = fold_xor(v1, fold512, buf + 16);
261
109k
        v2 = fold_xor(v2, fold512, buf + 32);
262
109k
        v3 = fold_xor(v3, fold512, buf + 48);
263
109k
        buf += 64;
264
109k
        size -= 64;
265
109k
      }
266
267
9.41k
      v0 = _mm_xor_si128(v1, fold(v0, fold128));
268
9.41k
      v0 = _mm_xor_si128(v2, fold(v0, fold128));
269
9.41k
      v0 = _mm_xor_si128(v3, fold(v0, fold128));
270
9.41k
    }
271
272
22.4k
    while (size >= 16) {
273
10.1k
      v0 = fold_xor(v0, fold128, buf);
274
10.1k
      buf += 16;
275
10.1k
      size -= 16;
276
10.1k
    }
277
278
12.2k
    if (size > 0) {
279
      // We want the last "size" number of input bytes to
280
      // be at the high bits of v1. First do a full 16-byte
281
      // load and then mask the low bytes to zeros.
282
11.1k
      v1 = my_load128(buf + size - 16);
283
11.1k
      v1 = keep_high_bytes(v1, size);
284
285
      // Shift high bytes from v0 to the low bytes of v1.
286
      //
287
      // Alternatively we could replace the combination
288
      // keep_high_bytes + shift_right + _mm_or_si128 with
289
      // _mm_shuffle_epi8 + _mm_blendv_epi8 but that would
290
      // require larger tables for the masks. Now there are
291
      // three loads (instead of two) from the mask tables
292
      // but they all are from the same cache line.
293
11.1k
      v1 = _mm_or_si128(v1, shift_right(v0, size));
294
295
      // Shift high bytes of v0 away, padding the
296
      // low bytes with zeros.
297
11.1k
      v0 = shift_left(v0, 16 - size);
298
299
11.1k
      v0 = _mm_xor_si128(v1, fold(v0, fold128));
300
11.1k
    }
301
302
12.2k
    v1 = _mm_srli_si128(v0, 8);
303
12.2k
    v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
304
12.2k
    v0 = _mm_xor_si128(v0, v1);
305
12.2k
  }
306
307
  // Barrett reduction
308
309
#if BUILDING_CRC_CLMUL == 32
310
  v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
311
  v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
312
  v0 = _mm_xor_si128(v0, v1);
313
  return ~(uint32_t)_mm_extract_epi32(v0, 2);
314
#else
315
  // Because p is 65 bits but one bit doesn't fit into the 64-bit
316
  // half of __m128i, finish the second clmul by shifting v1 left
317
  // by 64 bits and xorring it to the final result.
318
  v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
319
  v2 = _mm_slli_si128(v1, 8);
320
  v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
321
  v0 = _mm_xor_si128(v0, v2);
322
  v0 = _mm_xor_si128(v0, v1);
323
#if defined(__i386__) || defined(_M_IX86)
324
  return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) |
325
      (uint64_t)(uint32_t)_mm_extract_epi32(v0, 2));
326
#else
327
  return ~(uint64_t)_mm_extract_epi64(v0, 1);
328
#endif
329
#endif
330
45.7k
}
crc32_fast.c:crc32_arch_optimized
Line
Count
Source
154
29.9k
{
155
  // We will assume that there is at least one byte of input.
156
29.9k
  if (size == 0)
157
4
    return crc;
158
159
  // See crc_clmul_consts_gen.c.
160
29.9k
#if BUILDING_CRC_CLMUL == 32
161
29.9k
  const __m128i fold512 = _mm_set_epi64x(0x1d9513d7, 0x8f352d95);
162
29.9k
  const __m128i fold128 = _mm_set_epi64x(0xccaa009e, 0xae689191);
163
29.9k
  const __m128i mu_p = _mm_set_epi64x(
164
29.9k
    (int64_t)0xb4e5b025f7011641, 0x1db710640);
165
#else
166
  const __m128i fold512 = _mm_set_epi64x(
167
    (int64_t)0x081f6054a7842df4, (int64_t)0x6ae3efbb9dd441f3);
168
169
  const __m128i fold128 = _mm_set_epi64x(
170
    (int64_t)0xdabe95afc7875f40, (int64_t)0xe05dd497ca393ae4);
171
172
  const __m128i mu_p = _mm_set_epi64x(
173
    (int64_t)0x9c3e466c172963d5, (int64_t)0x92d8af2baf0e1e84);
174
#endif
175
176
29.9k
  __m128i v0, v1, v2, v3;
177
178
29.9k
  crc = ~crc;
179
180
29.9k
  if (size < 8) {
181
9.05k
    uint64_t x = crc;
182
9.05k
    size_t i = 0;
183
184
    // Checking the bit instead of comparing the size means
185
    // that we don't need to update the size between the steps.
186
9.05k
    if (size & 4) {
187
2.43k
      x ^= read32le(buf);
188
2.43k
      buf += 4;
189
2.43k
      i = 32;
190
2.43k
    }
191
192
9.05k
    if (size & 2) {
193
8.35k
      x ^= (uint64_t)read16le(buf) << i;
194
8.35k
      buf += 2;
195
8.35k
      i += 16;
196
8.35k
    }
197
198
9.05k
    if (size & 1)
199
2.51k
      x ^= (uint64_t)*buf << i;
200
201
9.05k
    v0 = my_set_low64((int64_t)x);
202
9.05k
    v0 = shift_left(v0, 8 - size);
203
204
20.9k
  } else if (size < 16) {
205
12.2k
    v0 = my_set_low64((int64_t)(crc ^ read64le(buf)));
206
207
    // NOTE: buf is intentionally left 8 bytes behind so that
208
    // we can read the last 1-7 bytes with read64le(buf + size).
209
12.2k
    size -= 8;
210
211
    // Handling 8-byte input specially is a speed optimization
212
    // as the clmul can be skipped. A branch is also needed to
213
    // avoid a too high shift amount.
214
12.2k
    if (size > 0) {
215
10.2k
      const size_t padding = 8 - size;
216
10.2k
      uint64_t high = read64le(buf + size) >> (padding * 8);
217
218
#if defined(__i386__) || defined(_M_IX86)
219
      // Simple but likely not the best code for 32-bit x86.
220
      v0 = _mm_insert_epi32(v0, (int32_t)high, 2);
221
      v0 = _mm_insert_epi32(v0, (int32_t)(high >> 32), 3);
222
#else
223
10.2k
      v0 = _mm_insert_epi64(v0, (int64_t)high, 1);
224
10.2k
#endif
225
226
10.2k
      v0 = shift_left(v0, padding);
227
228
10.2k
      v1 = _mm_srli_si128(v0, 8);
229
10.2k
      v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
230
10.2k
      v0 = _mm_xor_si128(v0, v1);
231
10.2k
    }
232
12.2k
  } else {
233
8.70k
    v0 = my_set_low64((int64_t)crc);
234
235
    // To align or not to align the buf pointer? If the end of
236
    // the buffer isn't aligned, aligning the pointer here would
237
    // make us do an extra folding step with the associated byte
238
    // shuffling overhead. The cost of that would need to be
239
    // lower than the benefit of aligned reads. Testing on an old
240
    // Intel Ivy Bridge processor suggested that aligning isn't
241
    // worth the cost but it likely depends on the processor and
242
    // buffer size. Unaligned loads (MOVDQU) should be fast on
243
    // x86 processors that support PCLMULQDQ, so we don't align
244
    // the buf pointer here.
245
246
    // Read the first (and possibly the only) full 16 bytes.
247
8.70k
    v0 = _mm_xor_si128(v0, my_load128(buf));
248
8.70k
    buf += 16;
249
8.70k
    size -= 16;
250
251
8.70k
    if (size >= 48) {
252
6.90k
      v1 = my_load128(buf);
253
6.90k
      v2 = my_load128(buf + 16);
254
6.90k
      v3 = my_load128(buf + 32);
255
6.90k
      buf += 48;
256
6.90k
      size -= 48;
257
258
22.2k
      while (size >= 64) {
259
15.3k
        v0 = fold_xor(v0, fold512, buf);
260
15.3k
        v1 = fold_xor(v1, fold512, buf + 16);
261
15.3k
        v2 = fold_xor(v2, fold512, buf + 32);
262
15.3k
        v3 = fold_xor(v3, fold512, buf + 48);
263
15.3k
        buf += 64;
264
15.3k
        size -= 64;
265
15.3k
      }
266
267
6.90k
      v0 = _mm_xor_si128(v1, fold(v0, fold128));
268
6.90k
      v0 = _mm_xor_si128(v2, fold(v0, fold128));
269
6.90k
      v0 = _mm_xor_si128(v3, fold(v0, fold128));
270
6.90k
    }
271
272
14.5k
    while (size >= 16) {
273
5.85k
      v0 = fold_xor(v0, fold128, buf);
274
5.85k
      buf += 16;
275
5.85k
      size -= 16;
276
5.85k
    }
277
278
8.70k
    if (size > 0) {
279
      // We want the last "size" number of input bytes to
280
      // be at the high bits of v1. First do a full 16-byte
281
      // load and then mask the low bytes to zeros.
282
8.01k
      v1 = my_load128(buf + size - 16);
283
8.01k
      v1 = keep_high_bytes(v1, size);
284
285
      // Shift high bytes from v0 to the low bytes of v1.
286
      //
287
      // Alternatively we could replace the combination
288
      // keep_high_bytes + shift_right + _mm_or_si128 with
289
      // _mm_shuffle_epi8 + _mm_blendv_epi8 but that would
290
      // require larger tables for the masks. Now there are
291
      // three loads (instead of two) from the mask tables
292
      // but they all are from the same cache line.
293
8.01k
      v1 = _mm_or_si128(v1, shift_right(v0, size));
294
295
      // Shift high bytes of v0 away, padding the
296
      // low bytes with zeros.
297
8.01k
      v0 = shift_left(v0, 16 - size);
298
299
8.01k
      v0 = _mm_xor_si128(v1, fold(v0, fold128));
300
8.01k
    }
301
302
8.70k
    v1 = _mm_srli_si128(v0, 8);
303
8.70k
    v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
304
8.70k
    v0 = _mm_xor_si128(v0, v1);
305
8.70k
  }
306
307
  // Barrett reduction
308
309
29.9k
#if BUILDING_CRC_CLMUL == 32
310
29.9k
  v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
311
29.9k
  v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
312
29.9k
  v0 = _mm_xor_si128(v0, v1);
313
29.9k
  return ~(uint32_t)_mm_extract_epi32(v0, 2);
314
#else
315
  // Because p is 65 bits but one bit doesn't fit into the 64-bit
316
  // half of __m128i, finish the second clmul by shifting v1 left
317
  // by 64 bits and xorring it to the final result.
318
  v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
319
  v2 = _mm_slli_si128(v1, 8);
320
  v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
321
  v0 = _mm_xor_si128(v0, v2);
322
  v0 = _mm_xor_si128(v0, v1);
323
#if defined(__i386__) || defined(_M_IX86)
324
  return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) |
325
      (uint64_t)(uint32_t)_mm_extract_epi32(v0, 2));
326
#else
327
  return ~(uint64_t)_mm_extract_epi64(v0, 1);
328
#endif
329
#endif
330
29.9k
}
crc64_fast.c:crc64_arch_optimized
Line
Count
Source
154
15.7k
{
155
  // We will assume that there is at least one byte of input.
156
15.7k
  if (size == 0)
157
0
    return crc;
158
159
  // See crc_clmul_consts_gen.c.
160
#if BUILDING_CRC_CLMUL == 32
161
  const __m128i fold512 = _mm_set_epi64x(0x1d9513d7, 0x8f352d95);
162
  const __m128i fold128 = _mm_set_epi64x(0xccaa009e, 0xae689191);
163
  const __m128i mu_p = _mm_set_epi64x(
164
    (int64_t)0xb4e5b025f7011641, 0x1db710640);
165
#else
166
15.7k
  const __m128i fold512 = _mm_set_epi64x(
167
15.7k
    (int64_t)0x081f6054a7842df4, (int64_t)0x6ae3efbb9dd441f3);
168
169
15.7k
  const __m128i fold128 = _mm_set_epi64x(
170
15.7k
    (int64_t)0xdabe95afc7875f40, (int64_t)0xe05dd497ca393ae4);
171
172
15.7k
  const __m128i mu_p = _mm_set_epi64x(
173
15.7k
    (int64_t)0x9c3e466c172963d5, (int64_t)0x92d8af2baf0e1e84);
174
15.7k
#endif
175
176
15.7k
  __m128i v0, v1, v2, v3;
177
178
15.7k
  crc = ~crc;
179
180
15.7k
  if (size < 8) {
181
3.90k
    uint64_t x = crc;
182
3.90k
    size_t i = 0;
183
184
    // Checking the bit instead of comparing the size means
185
    // that we don't need to update the size between the steps.
186
3.90k
    if (size & 4) {
187
1.67k
      x ^= read32le(buf);
188
1.67k
      buf += 4;
189
1.67k
      i = 32;
190
1.67k
    }
191
192
3.90k
    if (size & 2) {
193
3.63k
      x ^= (uint64_t)read16le(buf) << i;
194
3.63k
      buf += 2;
195
3.63k
      i += 16;
196
3.63k
    }
197
198
3.90k
    if (size & 1)
199
2.21k
      x ^= (uint64_t)*buf << i;
200
201
3.90k
    v0 = my_set_low64((int64_t)x);
202
3.90k
    v0 = shift_left(v0, 8 - size);
203
204
11.8k
  } else if (size < 16) {
205
8.25k
    v0 = my_set_low64((int64_t)(crc ^ read64le(buf)));
206
207
    // NOTE: buf is intentionally left 8 bytes behind so that
208
    // we can read the last 1-7 bytes with read64le(buf + size).
209
8.25k
    size -= 8;
210
211
    // Handling 8-byte input specially is a speed optimization
212
    // as the clmul can be skipped. A branch is also needed to
213
    // avoid a too high shift amount.
214
8.25k
    if (size > 0) {
215
8.01k
      const size_t padding = 8 - size;
216
8.01k
      uint64_t high = read64le(buf + size) >> (padding * 8);
217
218
#if defined(__i386__) || defined(_M_IX86)
219
      // Simple but likely not the best code for 32-bit x86.
220
      v0 = _mm_insert_epi32(v0, (int32_t)high, 2);
221
      v0 = _mm_insert_epi32(v0, (int32_t)(high >> 32), 3);
222
#else
223
8.01k
      v0 = _mm_insert_epi64(v0, (int64_t)high, 1);
224
8.01k
#endif
225
226
8.01k
      v0 = shift_left(v0, padding);
227
228
8.01k
      v1 = _mm_srli_si128(v0, 8);
229
8.01k
      v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
230
8.01k
      v0 = _mm_xor_si128(v0, v1);
231
8.01k
    }
232
8.25k
  } else {
233
3.56k
    v0 = my_set_low64((int64_t)crc);
234
235
    // To align or not to align the buf pointer? If the end of
236
    // the buffer isn't aligned, aligning the pointer here would
237
    // make us do an extra folding step with the associated byte
238
    // shuffling overhead. The cost of that would need to be
239
    // lower than the benefit of aligned reads. Testing on an old
240
    // Intel Ivy Bridge processor suggested that aligning isn't
241
    // worth the cost but it likely depends on the processor and
242
    // buffer size. Unaligned loads (MOVDQU) should be fast on
243
    // x86 processors that support PCLMULQDQ, so we don't align
244
    // the buf pointer here.
245
246
    // Read the first (and possibly the only) full 16 bytes.
247
3.56k
    v0 = _mm_xor_si128(v0, my_load128(buf));
248
3.56k
    buf += 16;
249
3.56k
    size -= 16;
250
251
3.56k
    if (size >= 48) {
252
2.50k
      v1 = my_load128(buf);
253
2.50k
      v2 = my_load128(buf + 16);
254
2.50k
      v3 = my_load128(buf + 32);
255
2.50k
      buf += 48;
256
2.50k
      size -= 48;
257
258
96.2k
      while (size >= 64) {
259
93.7k
        v0 = fold_xor(v0, fold512, buf);
260
93.7k
        v1 = fold_xor(v1, fold512, buf + 16);
261
93.7k
        v2 = fold_xor(v2, fold512, buf + 32);
262
93.7k
        v3 = fold_xor(v3, fold512, buf + 48);
263
93.7k
        buf += 64;
264
93.7k
        size -= 64;
265
93.7k
      }
266
267
2.50k
      v0 = _mm_xor_si128(v1, fold(v0, fold128));
268
2.50k
      v0 = _mm_xor_si128(v2, fold(v0, fold128));
269
2.50k
      v0 = _mm_xor_si128(v3, fold(v0, fold128));
270
2.50k
    }
271
272
7.84k
    while (size >= 16) {
273
4.27k
      v0 = fold_xor(v0, fold128, buf);
274
4.27k
      buf += 16;
275
4.27k
      size -= 16;
276
4.27k
    }
277
278
3.56k
    if (size > 0) {
279
      // We want the last "size" number of input bytes to
280
      // be at the high bits of v1. First do a full 16-byte
281
      // load and then mask the low bytes to zeros.
282
3.09k
      v1 = my_load128(buf + size - 16);
283
3.09k
      v1 = keep_high_bytes(v1, size);
284
285
      // Shift high bytes from v0 to the low bytes of v1.
286
      //
287
      // Alternatively we could replace the combination
288
      // keep_high_bytes + shift_right + _mm_or_si128 with
289
      // _mm_shuffle_epi8 + _mm_blendv_epi8 but that would
290
      // require larger tables for the masks. Now there are
291
      // three loads (instead of two) from the mask tables
292
      // but they all are from the same cache line.
293
3.09k
      v1 = _mm_or_si128(v1, shift_right(v0, size));
294
295
      // Shift high bytes of v0 away, padding the
296
      // low bytes with zeros.
297
3.09k
      v0 = shift_left(v0, 16 - size);
298
299
3.09k
      v0 = _mm_xor_si128(v1, fold(v0, fold128));
300
3.09k
    }
301
302
3.56k
    v1 = _mm_srli_si128(v0, 8);
303
3.56k
    v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
304
3.56k
    v0 = _mm_xor_si128(v0, v1);
305
3.56k
  }
306
307
  // Barrett reduction
308
309
#if BUILDING_CRC_CLMUL == 32
310
  v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
311
  v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
312
  v0 = _mm_xor_si128(v0, v1);
313
  return ~(uint32_t)_mm_extract_epi32(v0, 2);
314
#else
315
  // Because p is 65 bits but one bit doesn't fit into the 64-bit
316
  // half of __m128i, finish the second clmul by shifting v1 left
317
  // by 64 bits and xorring it to the final result.
318
15.7k
  v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
319
15.7k
  v2 = _mm_slli_si128(v1, 8);
320
15.7k
  v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
321
15.7k
  v0 = _mm_xor_si128(v0, v2);
322
15.7k
  v0 = _mm_xor_si128(v0, v1);
323
#if defined(__i386__) || defined(_M_IX86)
324
  return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) |
325
      (uint64_t)(uint32_t)_mm_extract_epi32(v0, 2));
326
#else
327
  return ~(uint64_t)_mm_extract_epi64(v0, 1);
328
15.7k
#endif
329
15.7k
#endif
330
15.7k
}
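
For cross-checking the function above, a bit-at-a-time reference of the same CRC convention (invert on entry and on return, process bytes least significant bit first) fits in a few lines. This sketch assumes the usual reflected polynomials used by liblzma, 0xC96C5795D7870F42 (ECMA-182, bit-reversed) for CRC64 and 0xEDB88320 for CRC32; it is an illustration only, not part of the file being measured.

#include <stddef.h>
#include <stdint.h>

static uint64_t
crc64_ref(const uint8_t *buf, size_t size, uint64_t crc)
{
  crc = ~crc;
  for (size_t i = 0; i < size; ++i) {
    crc ^= buf[i];
    for (int bit = 0; bit < 8; ++bit) {
      if (crc & 1)
        crc = (crc >> 1) ^ 0xC96C5795D7870F42ULL;
      else
        crc >>= 1;
    }
  }
  return ~crc;
}

The CRC32 variant is the same loop with uint32_t state and the 0xEDB88320 polynomial; both should agree with crc64_arch_optimized() and crc32_arch_optimized() for every input length, including the short-input paths covered above.
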
331
332
333
// Even though this is an inline function, compile it only when needed.
334
// This way it won't appear in E2K builds at all.
335
#if defined(CRC32_GENERIC) || defined(CRC64_GENERIC)
336
// Inlining this function duplicates the function body in crc32_resolve() and
337
// crc64_resolve(), but this is acceptable because this is a tiny function.
338
static inline bool
339
is_arch_extension_supported(void)
340
592
{
341
592
  int success = 1;
342
592
  uint32_t r[4]; // eax, ebx, ecx, edx
343
344
#if defined(_MSC_VER)
345
  // This needs <intrin.h> with MSVC. ICC has it as a built-in
346
  // on all platforms.
347
  __cpuid(r, 1);
348
#elif defined(HAVE_CPUID_H)
349
  // Compared to just using __asm__ to run CPUID, this also checks
350
  // that CPUID is supported and saves and restores ebx as that is
351
  // needed with GCC < 5 with position-independent code (PIC).
352
592
  success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]);
353
#else
354
  // Just a fallback that shouldn't be needed.
355
  __asm__("cpuid\n\t"
356
      : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3])
357
      : "a"(1), "c"(0));
358
#endif
359
360
  // Returns true if these are supported:
361
  // CLMUL (bit 1 in ecx)
362
  // SSSE3 (bit 9 in ecx)
363
  // SSE4.1 (bit 19 in ecx)
364
592
  const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19);
365
592
  return success && (r[2] & ecx_mask) == ecx_mask;
366
367
  // Alternative methods that weren't used:
368
  //   - ICC's _may_i_use_cpu_feature: the other methods should work too.
369
  //   - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul")
370
  //
371
  // CPUID decoding is needed with MSVC anyway and older GCC. This keeps
372
  // the feature checks in the build system simpler too. The nice thing
373
  // about __builtin_cpu_supports would be that it generates very short
374
  // code as it only reads a variable set at startup but a few bytes
375
  // doesn't matter here.
376
592
}
crc32_fast.c:is_arch_extension_supported
Line
Count
Source
340
296
{
341
296
  int success = 1;
342
296
  uint32_t r[4]; // eax, ebx, ecx, edx
343
344
#if defined(_MSC_VER)
345
  // This needs <intrin.h> with MSVC. ICC has it as a built-in
346
  // on all platforms.
347
  __cpuid(r, 1);
348
#elif defined(HAVE_CPUID_H)
349
  // Compared to just using __asm__ to run CPUID, this also checks
350
  // that CPUID is supported and saves and restores ebx as that is
351
  // needed with GCC < 5 with position-independent code (PIC).
352
296
  success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]);
353
#else
354
  // Just a fallback that shouldn't be needed.
355
  __asm__("cpuid\n\t"
356
      : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3])
357
      : "a"(1), "c"(0));
358
#endif
359
360
  // Returns true if these are supported:
361
  // CLMUL (bit 1 in ecx)
362
  // SSSE3 (bit 9 in ecx)
363
  // SSE4.1 (bit 19 in ecx)
364
296
  const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19);
365
296
  return success && (r[2] & ecx_mask) == ecx_mask;
366
367
  // Alternative methods that weren't used:
368
  //   - ICC's _may_i_use_cpu_feature: the other methods should work too.
369
  //   - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul")
370
  //
371
  // CPUID decoding is needed with MSVC anyway and older GCC. This keeps
372
  // the feature checks in the build system simpler too. The nice thing
373
  // about __builtin_cpu_supports would be that it generates very short
374
  // code as it only reads a variable set at startup but a few bytes
375
  // doesn't matter here.
376
296
}
crc64_fast.c:is_arch_extension_supported
Line
Count
Source
340
296
{
341
296
  int success = 1;
342
296
  uint32_t r[4]; // eax, ebx, ecx, edx
343
344
#if defined(_MSC_VER)
345
  // This needs <intrin.h> with MSVC. ICC has it as a built-in
346
  // on all platforms.
347
  __cpuid(r, 1);
348
#elif defined(HAVE_CPUID_H)
349
  // Compared to just using __asm__ to run CPUID, this also checks
350
  // that CPUID is supported and saves and restores ebx as that is
351
  // needed with GCC < 5 with position-independent code (PIC).
352
296
  success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]);
353
#else
354
  // Just a fallback that shouldn't be needed.
355
  __asm__("cpuid\n\t"
356
      : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3])
357
      : "a"(1), "c"(0));
358
#endif
359
360
  // Returns true if these are supported:
361
  // CLMUL (bit 1 in ecx)
362
  // SSSE3 (bit 9 in ecx)
363
  // SSE4.1 (bit 19 in ecx)
364
296
  const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19);
365
296
  return success && (r[2] & ecx_mask) == ecx_mask;
366
367
  // Alternative methods that weren't used:
368
  //   - ICC's _may_i_use_cpu_feature: the other methods should work too.
369
  //   - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul")
370
  //
371
  // CPUID decoding is needed with MSVC anyway and older GCC. This keeps
372
  // the feature checks in the build system simpler too. The nice thing
373
  // about __builtin_cpu_supports would be that it generates very short
374
  // code as it only reads a variable set at startup but a few bytes
375
  // doesn't matter here.
376
296
}
377
#endif
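
The comment above notes that is_arch_extension_supported() is inlined into crc32_resolve() and crc64_resolve(), which are not part of this report. A hedged sketch of how such a resolver typically consumes the check (the names crc32_func_type and crc32_generic are assumptions made for this example, not taken from the source):

#include <stddef.h>
#include <stdint.h>

// Assumed prototype for the portable fallback; crc32_arch_optimized() is
// defined above in this translation unit when BUILDING_CRC_CLMUL == 32.
extern uint32_t crc32_generic(const uint8_t *buf, size_t size, uint32_t crc);

typedef uint32_t (*crc32_func_type)(const uint8_t *buf, size_t size,
    uint32_t crc);

// Pick the CLMUL implementation when the CPU reports CLMUL, SSSE3, and
// SSE4.1 support, otherwise fall back to the portable implementation.
static crc32_func_type
crc32_resolve_sketch(void)
{
  return is_arch_extension_supported()
      ? &crc32_arch_optimized : &crc32_generic;
}
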