Coverage Report

Created: 2025-08-29 07:00

/src/xz/src/liblzma/check/crc_x86_clmul.h
Line | Count | Source
1
// SPDX-License-Identifier: 0BSD
2
3
///////////////////////////////////////////////////////////////////////////////
4
//
5
/// \file       crc_x86_clmul.h
6
/// \brief      CRC32 and CRC64 implementations using CLMUL instructions.
7
///
8
/// The CRC32 and CRC64 implementations use 32/64-bit x86 SSSE3, SSE4.1, and
9
/// CLMUL instructions. This is compatible with Elbrus 2000 (E2K) too.
10
///
11
/// See the Intel white paper "Fast CRC Computation for Generic Polynomials
12
/// Using PCLMULQDQ Instruction" from 2009. The original file seems to be
13
/// gone from Intel's website but a version is available here:
14
/// https://www.researchgate.net/publication/263424619_Fast_CRC_computation
15
/// (The link was checked on 2024-06-11.)
16
///
17
/// While this file has both CRC32 and CRC64 implementations, only one
18
/// can be built at a time. The version to build is selected by defining
19
/// BUILDING_CRC_CLMUL to 32 or 64 before including this file.
20
///
21
/// NOTE: The x86 CLMUL CRC implementation was rewritten for XZ Utils 5.8.0.
22
//
23
//  Authors:    Lasse Collin
24
//              Ilya Kurdyukov
25
//
26
///////////////////////////////////////////////////////////////////////////////
27
28
// This file must not be included more than once.
29
#ifdef LZMA_CRC_X86_CLMUL_H
30
# error crc_x86_clmul.h was included twice.
31
#endif
32
#define LZMA_CRC_X86_CLMUL_H
33
34
#if BUILDING_CRC_CLMUL != 32 && BUILDING_CRC_CLMUL != 64
35
# error BUILDING_CRC_CLMUL is undefined or has an invalid value
36
#endif
37
38
#include <immintrin.h>
39
40
#if defined(_MSC_VER)
41
# include <intrin.h>
42
#elif defined(HAVE_CPUID_H)
43
# include <cpuid.h>
44
#endif
45
46
47
// EDG-based compilers (Intel's classic compiler and compiler for E2K) can
48
// define __GNUC__ but the attribute must not be used with them.
49
// The new Clang-based ICX needs the attribute.
50
//
51
// NOTE: Build systems check for this too; keep them in sync with this.
52
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
53
# define crc_attr_target \
54
    __attribute__((__target__("ssse3,sse4.1,pclmul")))
55
#else
56
# define crc_attr_target
57
#endif
58
59
60
// GCC and Clang would produce good code with _mm_set_epi64x
61
// but MSVC needs _mm_cvtsi64_si128 on x86-64.
62
#if defined(__i386__) || defined(_M_IX86)
63
# define my_set_low64(a) _mm_set_epi64x(0, (a))
64
#else
65
1.02M
# define my_set_low64(a) _mm_cvtsi64_si128(a)
66
#endif
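
Both branches of the macro above produce the same value: a 128-bit vector whose low 64-bit lane holds the argument and whose high lane is zero. A minimal self-check sketch, assuming a 64-bit build where _mm_cvtsi64_si128() is available (illustration only, not part of liblzma):

#include <immintrin.h>
#include <assert.h>

static void
my_set_low64_check(void)
{
    // Either definition of my_set_low64() must give this bit pattern.
    const __m128i a = my_set_low64((int64_t)0x1122334455667788);
    const __m128i b = _mm_set_epi64x(0, 0x1122334455667788);

    // All 16 byte lanes compare equal, so the byte mask is 0xFFFF.
    assert(_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) == 0xFFFF);
}
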
67
68
69
// Align it so that the whole array is within the same cache line.
70
// More than one unaligned load can be done from this during the
71
// same CRC function call.
72
//
73
// The bytes [0] to [31] are used with AND to clear the low bytes. (With ANDN
74
// those could be used to clear the high bytes too but it's not needed here.)
75
//
76
// The bytes [16] to [47] are for left shifts.
77
// The bytes [32] to [63] are for right shifts.
78
alignas(64)
79
static uint8_t vmasks[64] = {
80
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
81
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
82
  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
83
  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
84
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
85
  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
86
  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
87
  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
88
};
89
90
91
// *Unaligned* 128-bit load
92
crc_attr_target
93
static inline __m128i
94
my_load128(const uint8_t *p)
95
482k
{
96
482k
  return _mm_loadu_si128((const __m128i *)p);
97
482k
}
crc32_fast.c:my_load128
Line | Count | Source
95
358k
{
96
358k
  return _mm_loadu_si128((const __m128i *)p);
97
358k
}
crc64_fast.c:my_load128
Line | Count | Source
95
123k
{
96
123k
  return _mm_loadu_si128((const __m128i *)p);
97
123k
}
98
99
100
// Keep the highest "count" bytes as is and clear the remaining low bytes.
101
crc_attr_target
102
static inline __m128i
103
keep_high_bytes(__m128i v, size_t count)
104
4.64k
{
105
4.64k
  return _mm_and_si128(my_load128((vmasks + count)), v);
106
4.64k
}
crc32_fast.c:keep_high_bytes
Line | Count | Source
104
876
{
105
876
  return _mm_and_si128(my_load128((vmasks + count)), v);
106
876
}
crc64_fast.c:keep_high_bytes
Line | Count | Source
104
3.77k
{
105
3.77k
  return _mm_and_si128(my_load128((vmasks + count)), v);
106
3.77k
}
107
108
109
// Shift the 128-bit value left by "amount" bytes (not bits).
110
crc_attr_target
111
static inline __m128i
112
shift_left(__m128i v, size_t amount)
113
355k
{
114
355k
  return _mm_shuffle_epi8(v, my_load128((vmasks + 32 - amount)));
115
355k
}
crc32_fast.c:shift_left
Line | Count | Source
113
347k
{
114
347k
  return _mm_shuffle_epi8(v, my_load128((vmasks + 32 - amount)));
115
347k
}
crc64_fast.c:shift_left
Line | Count | Source
113
7.60k
{
114
7.60k
  return _mm_shuffle_epi8(v, my_load128((vmasks + 32 - amount)));
115
7.60k
}
116
117
118
// Shift the 128-bit value right by "amount" bytes (not bits).
119
crc_attr_target
120
static inline __m128i
121
shift_right(__m128i v, size_t amount)
122
4.64k
{
123
4.64k
  return _mm_shuffle_epi8(v, my_load128((vmasks + 32 + amount)));
124
4.64k
}
crc32_fast.c:shift_right
Line | Count | Source
122
876
{
123
876
  return _mm_shuffle_epi8(v, my_load128((vmasks + 32 + amount)));
124
876
}
crc64_fast.c:shift_right
Line | Count | Source
122
3.77k
{
123
3.77k
  return _mm_shuffle_epi8(v, my_load128((vmasks + 32 + amount)));
124
3.77k
}
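
A scalar sketch of how the three helpers above index into the single vmasks table (illustrative only, not liblzma code; pshufb16() models the SSSE3 _mm_shuffle_epi8 rule that a control byte with its high bit set produces a zero byte):

#include <stdint.h>

// Same layout as the vmasks array above.
static const uint8_t masks[64] = {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
};

// Scalar model of _mm_shuffle_epi8 on one 16-byte register.
static void
pshufb16(uint8_t *dst, const uint8_t *v, const uint8_t *ctl)
{
    for (int i = 0; i < 16; ++i)
        dst[i] = (ctl[i] & 0x80) ? 0 : v[ctl[i] & 0x0F];
}

int
main(void)
{
    uint8_t v[16], out[16];
    for (int i = 0; i < 16; ++i)
        v[i] = (uint8_t)(0x10 + i);

    // keep_high_bytes(v, 3): the 16-byte window at masks + 3 has
    // thirteen 0x00 bytes followed by three 0xFF bytes, so the AND
    // keeps only the three highest byte lanes (0x1D 0x1E 0x1F).
    for (int i = 0; i < 16; ++i)
        out[i] = (uint8_t)(v[i] & masks[3 + i]);

    // shift_left(v, 3): the window at masks + 32 - 3 starts with three
    // 0xFF bytes, so the three low lanes become zero and the remaining
    // lanes move toward the high end.
    pshufb16(out, v, masks + 32 - 3);

    // shift_right(v, 3): the window at masks + 32 + 3 ends with three
    // 0xFF bytes, so the lanes move toward the low end and the three
    // high lanes are cleared.
    pshufb16(out, v, masks + 32 + 3);

    (void)out;
    return 0;
}
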
125
126
127
crc_attr_target
128
static inline __m128i
129
fold(__m128i v, __m128i k)
130
109k
{
131
109k
  __m128i a = _mm_clmulepi64_si128(v, k, 0x00);
132
109k
  __m128i b = _mm_clmulepi64_si128(v, k, 0x11);
133
109k
  return _mm_xor_si128(a, b);
134
109k
}
crc32_fast.c:fold
Line | Count | Source
130
5.44k
{
131
5.44k
  __m128i a = _mm_clmulepi64_si128(v, k, 0x00);
132
5.44k
  __m128i b = _mm_clmulepi64_si128(v, k, 0x11);
133
5.44k
  return _mm_xor_si128(a, b);
134
5.44k
}
crc64_fast.c:fold
Line | Count | Source
130
104k
{
131
104k
  __m128i a = _mm_clmulepi64_si128(v, k, 0x00);
132
104k
  __m128i b = _mm_clmulepi64_si128(v, k, 0x11);
133
104k
  return _mm_xor_si128(a, b);
134
104k
}
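
fold() above is built on _mm_clmulepi64_si128, which multiplies two 64-bit polynomials over GF(2) (carry-less multiplication) into a 128-bit product; the 0x00 and 0x11 immediates select the low and the high 64-bit halves of v and k, and fold() XORs the two products together. A scalar model of one such multiplication (illustration only, plain C, not liblzma code):

#include <stdint.h>

typedef struct { uint64_t lo, hi; } clmul128;

// Carry-less 64 x 64 -> 128 multiply: addition over GF(2) is XOR, so
// the shift-and-add loop XORs partial products and never carries.
static clmul128
clmul64(uint64_t a, uint64_t b)
{
    clmul128 r = { 0, 0 };

    for (unsigned int i = 0; i < 64; ++i) {
        if ((b >> i) & 1) {
            r.lo ^= a << i;
            if (i != 0)
                r.hi ^= a >> (64 - i);
        }
    }

    return r;
}
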
135
136
137
crc_attr_target
138
static inline __m128i
139
fold_xor(__m128i v, __m128i k, const uint8_t *buf)
140
96.9k
{
141
96.9k
  return _mm_xor_si128(my_load128(buf), fold(v, k));
142
96.9k
}
crc32_fast.c:fold_xor
Line | Count | Source
140
4.24k
{
141
4.24k
  return _mm_xor_si128(my_load128(buf), fold(v, k));
142
4.24k
}
crc64_fast.c:fold_xor
Line | Count | Source
140
92.6k
{
141
92.6k
  return _mm_xor_si128(my_load128(buf), fold(v, k));
142
92.6k
}
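
fold_xor() is the folding step from the Intel paper cited at the top of this file: the running 128-bit state is carry-lessly multiplied by a precomputed power of x reduced modulo the CRC polynomial, and the next 16 input bytes are XORed in. In the paper's non-reflected notation, with state S(x), next block D(x), and a fold distance of 128 bits, the step relies on the identity

    (S(x) \cdot x^{128} + D(x)) \bmod P(x) = (S(x) \cdot (x^{128} \bmod P(x)) + D(x)) \bmod P(x)

so the CRC remainder is preserved while the working value stays 128 bits wide; the full reduction modulo P(x) is deferred to the Barrett step at the end of the CRC function below. The fold512 and fold128 constants in that function encode fold distances of 512 bits (the 64-byte stride of the four-accumulator loop) and 128 bits (one 16-byte block); see crc_clmul_consts_gen.c for how they are generated.
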
143
144
145
#if BUILDING_CRC_CLMUL == 32
146
crc_attr_target
147
static uint32_t
148
crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
149
#else
150
crc_attr_target
151
static uint64_t
152
crc64_arch_optimized(const uint8_t *buf, size_t size, uint64_t crc)
153
#endif
154
1.02M
{
155
  // We will assume that there is at least one byte of input.
156
1.02M
  if (size == 0)
157
110
    return crc;
158
159
  // See crc_clmul_consts_gen.c.
160
#if BUILDING_CRC_CLMUL == 32
161
1.01M
  const __m128i fold512 = _mm_set_epi64x(0x1d9513d7, 0x8f352d95);
162
1.01M
  const __m128i fold128 = _mm_set_epi64x(0xccaa009e, 0xae689191);
163
1.01M
  const __m128i mu_p = _mm_set_epi64x(
164
1.01M
    (int64_t)0xb4e5b025f7011641, 0x1db710640);
165
#else
166
8.37k
  const __m128i fold512 = _mm_set_epi64x(
167
8.37k
    (int64_t)0x081f6054a7842df4, (int64_t)0x6ae3efbb9dd441f3);
168
169
  const __m128i fold128 = _mm_set_epi64x(
170
    (int64_t)0xdabe95afc7875f40, (int64_t)0xe05dd497ca393ae4);
171
172
8.37k
  const __m128i mu_p = _mm_set_epi64x(
173
8.37k
    (int64_t)0x9c3e466c172963d5, (int64_t)0x92d8af2baf0e1e84);
174
8.37k
#endif
175
176
8.37k
  __m128i v0, v1, v2, v3;
177
178
8.37k
  crc = ~crc;
179
180
1.02M
  if (size < 8) {
181
342k
    uint64_t x = crc;
182
342k
    size_t i = 0;
183
184
    // Checking the bit instead of comparing the size means
185
    // that we don't need to update the size between the steps.
186
342k
    if (size & 4) {
187
218k
      x ^= read32le(buf);
188
218k
      buf += 4;
189
218k
      i = 32;
190
218k
    }
191
192
342k
    if (size & 2) {
193
236k
      x ^= (uint64_t)read16le(buf) << i;
194
236k
      buf += 2;
195
236k
      i += 16;
196
236k
    }
197
198
342k
    if (size & 1)
199
1.93k
      x ^= (uint64_t)*buf << i;
200
201
342k
    v0 = my_set_low64((int64_t)x);
202
342k
    v0 = shift_left(v0, 8 - size);
203
204
677k
  } else if (size < 16) {
205
669k
    v0 = my_set_low64((int64_t)(crc ^ read64le(buf)));
206
207
    // NOTE: buf is intentionally left 8 bytes behind so that
208
    // we can read the last 1-7 bytes with read64le(buf + size).
209
669k
    size -= 8;
210
211
    // Handling 8-byte input specially is a speed optimization
212
    // as the clmul can be skipped. A branch is also needed to
213
    // avoid a too high shift amount.
214
669k
    if (size > 0) {
215
8.04k
      const size_t padding = 8 - size;
216
8.04k
      uint64_t high = read64le(buf + size) >> (padding * 8);
217
218
#if defined(__i386__) || defined(_M_IX86)
219
      // Simple but likely not the best code for 32-bit x86.
220
      v0 = _mm_insert_epi32(v0, (int32_t)high, 2);
221
      v0 = _mm_insert_epi32(v0, (int32_t)(high >> 32), 3);
222
#else
223
8.04k
      v0 = _mm_insert_epi64(v0, (int64_t)high, 1);
224
8.04k
#endif
225
226
8.04k
      v0 = shift_left(v0, padding);
227
228
8.04k
      v1 = _mm_srli_si128(v0, 8);
229
8.04k
      v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
230
8.04k
      v0 = _mm_xor_si128(v0, v1);
231
8.04k
    }
232
669k
  } else {
233
8.00k
    v0 = my_set_low64((int64_t)crc);
234
235
    // To align or not to align the buf pointer? If the end of
236
    // the buffer isn't aligned, aligning the pointer here would
237
    // make us do an extra folding step with the associated byte
238
    // shuffling overhead. The cost of that would need to be
239
    // lower than the benefit of aligned reads. Testing on an old
240
    // Intel Ivy Bridge processor suggested that aligning isn't
241
    // worth the cost but it likely depends on the processor and
242
    // buffer size. Unaligned loads (MOVDQU) should be fast on
243
    // x86 processors that support PCLMULQDQ, so we don't align
244
    // the buf pointer here.
245
246
    // Read the first (and possibly the only) full 16 bytes.
247
8.00k
    v0 = _mm_xor_si128(v0, my_load128(buf));
248
8.00k
    buf += 16;
249
8.00k
    size -= 16;
250
251
8.00k
    if (size >= 48) {
252
2.73k
      v1 = my_load128(buf);
253
2.73k
      v2 = my_load128(buf + 16);
254
2.73k
      v3 = my_load128(buf + 32);
255
2.73k
      buf += 48;
256
2.73k
      size -= 48;
257
258
25.3k
      while (size >= 64) {
259
22.5k
        v0 = fold_xor(v0, fold512, buf);
260
22.5k
        v1 = fold_xor(v1, fold512, buf + 16);
261
22.5k
        v2 = fold_xor(v2, fold512, buf + 32);
262
22.5k
        v3 = fold_xor(v3, fold512, buf + 48);
263
22.5k
        buf += 64;
264
22.5k
        size -= 64;
265
22.5k
      }
266
267
2.73k
      v0 = _mm_xor_si128(v1, fold(v0, fold128));
268
2.73k
      v0 = _mm_xor_si128(v2, fold(v0, fold128));
269
2.73k
      v0 = _mm_xor_si128(v3, fold(v0, fold128));
270
2.73k
    }
271
272
14.6k
    while (size >= 16) {
273
6.60k
      v0 = fold_xor(v0, fold128, buf);
274
6.60k
      buf += 16;
275
6.60k
      size -= 16;
276
6.60k
    }
277
278
8.00k
    if (size > 0) {
279
      // We want the last "size" number of input bytes to
280
      // be at the high bits of v1. First do a full 16-byte
281
      // load and then mask the low bytes to zeros.
282
4.64k
      v1 = my_load128(buf + size - 16);
283
4.64k
      v1 = keep_high_bytes(v1, size);
284
285
      // Shift high bytes from v0 to the low bytes of v1.
286
      //
287
      // Alternatively we could replace the combination
288
      // keep_high_bytes + shift_right + _mm_or_si128 with
289
      // _mm_shuffle_epi8 + _mm_blendv_epi8 but that would
290
      // require larger tables for the masks. Now there are
291
      // three loads (instead of two) from the mask tables
292
      // but they all are from the same cache line.
293
4.64k
      v1 = _mm_or_si128(v1, shift_right(v0, size));
294
295
      // Shift high bytes of v0 away, padding the
296
      // low bytes with zeros.
297
4.64k
      v0 = shift_left(v0, 16 - size);
298
299
4.64k
      v0 = _mm_xor_si128(v1, fold(v0, fold128));
300
4.64k
    }
301
302
8.00k
    v1 = _mm_srli_si128(v0, 8);
303
8.00k
    v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
304
8.00k
    v0 = _mm_xor_si128(v0, v1);
305
8.00k
  }
306
307
  // Barrett reduction
308
309
#if BUILDING_CRC_CLMUL == 32
310
  v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
311
  v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
312
  v0 = _mm_xor_si128(v0, v1);
313
  return ~(uint32_t)_mm_extract_epi32(v0, 2);
314
#else
315
  // Because p is 65 bits but one bit doesn't fit into the 64-bit
316
  // half of __m128i, finish the second clmul by shifting v1 left
317
  // by 64 bits and xorring it to the final result.
318
  v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
319
  v2 = _mm_slli_si128(v1, 8);
320
  v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
321
  v0 = _mm_xor_si128(v0, v2);
322
  v0 = _mm_xor_si128(v0, v1);
323
#if defined(__i386__) || defined(_M_IX86)
324
  return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) |
325
      (uint64_t)(uint32_t)_mm_extract_epi32(v0, 2));
326
#else
327
8.37k
  return ~(uint64_t)_mm_extract_epi64(v0, 1);
328
#endif
329
#endif
330
1.02M
}
crc32_fast.c:crc32_arch_optimized
Line | Count | Source
154
1.01M
{
155
  // We will assume that there is at least one byte of input.
156
1.01M
  if (size == 0)
157
110
    return crc;
158
159
  // See crc_clmul_consts_gen.c.
160
1.01M
#if BUILDING_CRC_CLMUL == 32
161
1.01M
  const __m128i fold512 = _mm_set_epi64x(0x1d9513d7, 0x8f352d95);
162
1.01M
  const __m128i fold128 = _mm_set_epi64x(0xccaa009e, 0xae689191);
163
1.01M
  const __m128i mu_p = _mm_set_epi64x(
164
1.01M
    (int64_t)0xb4e5b025f7011641, 0x1db710640);
165
#else
166
  const __m128i fold512 = _mm_set_epi64x(
167
    (int64_t)0x081f6054a7842df4, (int64_t)0x6ae3efbb9dd441f3);
168
169
  const __m128i fold128 = _mm_set_epi64x(
170
    (int64_t)0xdabe95afc7875f40, (int64_t)0xe05dd497ca393ae4);
171
172
  const __m128i mu_p = _mm_set_epi64x(
173
    (int64_t)0x9c3e466c172963d5, (int64_t)0x92d8af2baf0e1e84);
174
#endif
175
176
1.01M
  __m128i v0, v1, v2, v3;
177
178
1.01M
  crc = ~crc;
179
180
1.01M
  if (size < 8) {
181
340k
    uint64_t x = crc;
182
340k
    size_t i = 0;
183
184
    // Checking the bit instead of comparing the size means
185
    // that we don't need to update the size between the steps.
186
340k
    if (size & 4) {
187
217k
      x ^= read32le(buf);
188
217k
      buf += 4;
189
217k
      i = 32;
190
217k
    }
191
192
340k
    if (size & 2) {
193
234k
      x ^= (uint64_t)read16le(buf) << i;
194
234k
      buf += 2;
195
234k
      i += 16;
196
234k
    }
197
198
340k
    if (size & 1)
199
655
      x ^= (uint64_t)*buf << i;
200
201
340k
    v0 = my_set_low64((int64_t)x);
202
340k
    v0 = shift_left(v0, 8 - size);
203
204
671k
  } else if (size < 16) {
205
668k
    v0 = my_set_low64((int64_t)(crc ^ read64le(buf)));
206
207
    // NOTE: buf is intentionally left 8 bytes behind so that
208
    // we can read the last 1-7 bytes with read64le(buf + size).
209
668k
    size -= 8;
210
211
    // Handling 8-byte input specially is a speed optimization
212
    // as the clmul can be skipped. A branch is also needed to
213
    // avoid a too high shift amount.
214
668k
    if (size > 0) {
215
6.96k
      const size_t padding = 8 - size;
216
6.96k
      uint64_t high = read64le(buf + size) >> (padding * 8);
217
218
#if defined(__i386__) || defined(_M_IX86)
219
      // Simple but likely not the best code for 32-bit x86.
220
      v0 = _mm_insert_epi32(v0, (int32_t)high, 2);
221
      v0 = _mm_insert_epi32(v0, (int32_t)(high >> 32), 3);
222
#else
223
6.96k
      v0 = _mm_insert_epi64(v0, (int64_t)high, 1);
224
6.96k
#endif
225
226
6.96k
      v0 = shift_left(v0, padding);
227
228
6.96k
      v1 = _mm_srli_si128(v0, 8);
229
6.96k
      v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
230
6.96k
      v0 = _mm_xor_si128(v0, v1);
231
6.96k
    }
232
668k
  } else {
233
3.83k
    v0 = my_set_low64((int64_t)crc);
234
235
    // To align or not to align the buf pointer? If the end of
236
    // the buffer isn't aligned, aligning the pointer here would
237
    // make us do an extra folding step with the associated byte
238
    // shuffling overhead. The cost of that would need to be
239
    // lower than the benefit of aligned reads. Testing on an old
240
    // Intel Ivy Bridge processor suggested that aligning isn't
241
    // worth the cost but it likely depends on the processor and
242
    // buffer size. Unaligned loads (MOVDQU) should be fast on
243
    // x86 processors that support PCLMULQDQ, so we don't align
244
    // the buf pointer here.
245
246
    // Read the first (and possibly the only) full 16 bytes.
247
3.83k
    v0 = _mm_xor_si128(v0, my_load128(buf));
248
3.83k
    buf += 16;
249
3.83k
    size -= 16;
250
251
3.83k
    if (size >= 48) {
252
109
      v1 = my_load128(buf);
253
109
      v2 = my_load128(buf + 16);
254
109
      v3 = my_load128(buf + 32);
255
109
      buf += 48;
256
109
      size -= 48;
257
258
655
      while (size >= 64) {
259
546
        v0 = fold_xor(v0, fold512, buf);
260
546
        v1 = fold_xor(v1, fold512, buf + 16);
261
546
        v2 = fold_xor(v2, fold512, buf + 32);
262
546
        v3 = fold_xor(v3, fold512, buf + 48);
263
546
        buf += 64;
264
546
        size -= 64;
265
546
      }
266
267
109
      v0 = _mm_xor_si128(v1, fold(v0, fold128));
268
109
      v0 = _mm_xor_si128(v2, fold(v0, fold128));
269
109
      v0 = _mm_xor_si128(v3, fold(v0, fold128));
270
109
    }
271
272
5.88k
    while (size >= 16) {
273
2.05k
      v0 = fold_xor(v0, fold128, buf);
274
2.05k
      buf += 16;
275
2.05k
      size -= 16;
276
2.05k
    }
277
278
3.83k
    if (size > 0) {
279
      // We want the last "size" number of input bytes to
280
      // be at the high bits of v1. First do a full 16-byte
281
      // load and then mask the low bytes to zeros.
282
876
      v1 = my_load128(buf + size - 16);
283
876
      v1 = keep_high_bytes(v1, size);
284
285
      // Shift high bytes from v0 to the low bytes of v1.
286
      //
287
      // Alternatively we could replace the combination
288
      // keep_high_bytes + shift_right + _mm_or_si128 with
289
      // _mm_shuffle_epi8 + _mm_blendv_epi8 but that would
290
      // require larger tables for the masks. Now there are
291
      // three loads (instead of two) from the mask tables
292
      // but they all are from the same cache line.
293
876
      v1 = _mm_or_si128(v1, shift_right(v0, size));
294
295
      // Shift high bytes of v0 away, padding the
296
      // low bytes with zeros.
297
876
      v0 = shift_left(v0, 16 - size);
298
299
876
      v0 = _mm_xor_si128(v1, fold(v0, fold128));
300
876
    }
301
302
3.83k
    v1 = _mm_srli_si128(v0, 8);
303
3.83k
    v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
304
3.83k
    v0 = _mm_xor_si128(v0, v1);
305
3.83k
  }
306
307
  // Barrett reduction
308
309
1.01M
#if BUILDING_CRC_CLMUL == 32
310
1.01M
  v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
311
1.01M
  v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
312
1.01M
  v0 = _mm_xor_si128(v0, v1);
313
1.01M
  return ~(uint32_t)_mm_extract_epi32(v0, 2);
314
#else
315
  // Because p is 65 bits but one bit doesn't fit into the 64-bit
316
  // half of __m128i, finish the second clmul by shifting v1 left
317
  // by 64 bits and xorring it to the final result.
318
  v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
319
  v2 = _mm_slli_si128(v1, 8);
320
  v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
321
  v0 = _mm_xor_si128(v0, v2);
322
  v0 = _mm_xor_si128(v0, v1);
323
#if defined(__i386__) || defined(_M_IX86)
324
  return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) |
325
      (uint64_t)(uint32_t)_mm_extract_epi32(v0, 2));
326
#else
327
  return ~(uint64_t)_mm_extract_epi64(v0, 1);
328
#endif
329
#endif
330
1.01M
}
crc64_fast.c:crc64_arch_optimized
Line | Count | Source
154
8.37k
{
155
  // We will assume that there is at least one byte of input.
156
8.37k
  if (size == 0)
157
0
    return crc;
158
159
  // See crc_clmul_consts_gen.c.
160
#if BUILDING_CRC_CLMUL == 32
161
  const __m128i fold512 = _mm_set_epi64x(0x1d9513d7, 0x8f352d95);
162
  const __m128i fold128 = _mm_set_epi64x(0xccaa009e, 0xae689191);
163
  const __m128i mu_p = _mm_set_epi64x(
164
    (int64_t)0xb4e5b025f7011641, 0x1db710640);
165
#else
166
8.37k
  const __m128i fold512 = _mm_set_epi64x(
167
8.37k
    (int64_t)0x081f6054a7842df4, (int64_t)0x6ae3efbb9dd441f3);
168
169
8.37k
  const __m128i fold128 = _mm_set_epi64x(
170
8.37k
    (int64_t)0xdabe95afc7875f40, (int64_t)0xe05dd497ca393ae4);
171
172
8.37k
  const __m128i mu_p = _mm_set_epi64x(
173
8.37k
    (int64_t)0x9c3e466c172963d5, (int64_t)0x92d8af2baf0e1e84);
174
8.37k
#endif
175
176
8.37k
  __m128i v0, v1, v2, v3;
177
178
8.37k
  crc = ~crc;
179
180
8.37k
  if (size < 8) {
181
2.75k
    uint64_t x = crc;
182
2.75k
    size_t i = 0;
183
184
    // Checking the bit instead of comparing the size means
185
    // that we don't need to update the size between the steps.
186
2.75k
    if (size & 4) {
187
1.64k
      x ^= read32le(buf);
188
1.64k
      buf += 4;
189
1.64k
      i = 32;
190
1.64k
    }
191
192
2.75k
    if (size & 2) {
193
1.49k
      x ^= (uint64_t)read16le(buf) << i;
194
1.49k
      buf += 2;
195
1.49k
      i += 16;
196
1.49k
    }
197
198
2.75k
    if (size & 1)
199
1.27k
      x ^= (uint64_t)*buf << i;
200
201
2.75k
    v0 = my_set_low64((int64_t)x);
202
2.75k
    v0 = shift_left(v0, 8 - size);
203
204
5.61k
  } else if (size < 16) {
205
1.44k
    v0 = my_set_low64((int64_t)(crc ^ read64le(buf)));
206
207
    // NOTE: buf is intentionally left 8 bytes behind so that
208
    // we can read the last 1-7 bytes with read64le(buf + size).
209
1.44k
    size -= 8;
210
211
    // Handling 8-byte input specially is a speed optimization
212
    // as the clmul can be skipped. A branch is also needed to
213
    // avoid a too high shift amount.
214
1.44k
    if (size > 0) {
215
1.08k
      const size_t padding = 8 - size;
216
1.08k
      uint64_t high = read64le(buf + size) >> (padding * 8);
217
218
#if defined(__i386__) || defined(_M_IX86)
219
      // Simple but likely not the best code for 32-bit x86.
220
      v0 = _mm_insert_epi32(v0, (int32_t)high, 2);
221
      v0 = _mm_insert_epi32(v0, (int32_t)(high >> 32), 3);
222
#else
223
1.08k
      v0 = _mm_insert_epi64(v0, (int64_t)high, 1);
224
1.08k
#endif
225
226
1.08k
      v0 = shift_left(v0, padding);
227
228
1.08k
      v1 = _mm_srli_si128(v0, 8);
229
1.08k
      v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
230
1.08k
      v0 = _mm_xor_si128(v0, v1);
231
1.08k
    }
232
4.17k
  } else {
233
4.17k
    v0 = my_set_low64((int64_t)crc);
234
235
    // To align or not to align the buf pointer? If the end of
236
    // the buffer isn't aligned, aligning the pointer here would
237
    // make us do an extra folding step with the associated byte
238
    // shuffling overhead. The cost of that would need to be
239
    // lower than the benefit of aligned reads. Testing on an old
240
    // Intel Ivy Bridge processor suggested that aligning isn't
241
    // worth the cost but it likely depends on the processor and
242
    // buffer size. Unaligned loads (MOVDQU) should be fast on
243
    // x86 processors that support PCLMULQDQ, so we don't align
244
    // the buf pointer here.
245
246
    // Read the first (and possibly the only) full 16 bytes.
247
4.17k
    v0 = _mm_xor_si128(v0, my_load128(buf));
248
4.17k
    buf += 16;
249
4.17k
    size -= 16;
250
251
4.17k
    if (size >= 48) {
252
2.62k
      v1 = my_load128(buf);
253
2.62k
      v2 = my_load128(buf + 16);
254
2.62k
      v3 = my_load128(buf + 32);
255
2.62k
      buf += 48;
256
2.62k
      size -= 48;
257
258
24.6k
      while (size >= 64) {
259
22.0k
        v0 = fold_xor(v0, fold512, buf);
260
22.0k
        v1 = fold_xor(v1, fold512, buf + 16);
261
22.0k
        v2 = fold_xor(v2, fold512, buf + 32);
262
22.0k
        v3 = fold_xor(v3, fold512, buf + 48);
263
22.0k
        buf += 64;
264
22.0k
        size -= 64;
265
22.0k
      }
266
267
2.62k
      v0 = _mm_xor_si128(v1, fold(v0, fold128));
268
2.62k
      v0 = _mm_xor_si128(v2, fold(v0, fold128));
269
2.62k
      v0 = _mm_xor_si128(v3, fold(v0, fold128));
270
2.62k
    }
271
272
8.72k
    while (size >= 16) {
273
4.55k
      v0 = fold_xor(v0, fold128, buf);
274
4.55k
      buf += 16;
275
4.55k
      size -= 16;
276
4.55k
    }
277
278
4.17k
    if (size > 0) {
279
      // We want the last "size" number of input bytes to
280
      // be at the high bits of v1. First do a full 16-byte
281
      // load and then mask the low bytes to zeros.
282
3.77k
      v1 = my_load128(buf + size - 16);
283
3.77k
      v1 = keep_high_bytes(v1, size);
284
285
      // Shift high bytes from v0 to the low bytes of v1.
286
      //
287
      // Alternatively we could replace the combination
288
      // keep_high_bytes + shift_right + _mm_or_si128 with
289
      // _mm_shuffle_epi8 + _mm_blendv_epi8 but that would
290
      // require larger tables for the masks. Now there are
291
      // three loads (instead of two) from the mask tables
292
      // but they all are from the same cache line.
293
3.77k
      v1 = _mm_or_si128(v1, shift_right(v0, size));
294
295
      // Shift high bytes of v0 away, padding the
296
      // low bytes with zeros.
297
3.77k
      v0 = shift_left(v0, 16 - size);
298
299
3.77k
      v0 = _mm_xor_si128(v1, fold(v0, fold128));
300
3.77k
    }
301
302
4.17k
    v1 = _mm_srli_si128(v0, 8);
303
4.17k
    v0 = _mm_clmulepi64_si128(v0, fold128, 0x10);
304
4.17k
    v0 = _mm_xor_si128(v0, v1);
305
4.17k
  }
306
307
  // Barrett reduction
308
309
#if BUILDING_CRC_CLMUL == 32
310
  v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
311
  v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
312
  v0 = _mm_xor_si128(v0, v1);
313
  return ~(uint32_t)_mm_extract_epi32(v0, 2);
314
#else
315
  // Because p is 65 bits but one bit doesn't fit into the 64-bit
316
  // half of __m128i, finish the second clmul by shifting v1 left
317
  // by 64 bits and xorring it to the final result.
318
8.37k
  v1 = _mm_clmulepi64_si128(v0, mu_p, 0x10); // v0 * mu
319
8.37k
  v2 = _mm_slli_si128(v1, 8);
320
8.37k
  v1 = _mm_clmulepi64_si128(v1, mu_p, 0x00); // v1 * p
321
8.37k
  v0 = _mm_xor_si128(v0, v2);
322
8.37k
  v0 = _mm_xor_si128(v0, v1);
323
#if defined(__i386__) || defined(_M_IX86)
324
  return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) |
325
      (uint64_t)(uint32_t)_mm_extract_epi32(v0, 2));
326
#else
327
8.37k
  return ~(uint64_t)_mm_extract_epi64(v0, 1);
328
8.37k
#endif
329
8.37k
#endif
330
8.37k
}
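
For sanity-checking the vectorized code above on small inputs, a bit-at-a-time reference is useful. A minimal sketch, assuming the standard reflected CRC-32 polynomial 0xEDB88320 used by XZ's CRC32; liblzma also ships generic table-based implementations (the CRC32_GENERIC/CRC64_GENERIC guard below refers to those), so this standalone version is only an illustration:

#include <stddef.h>
#include <stdint.h>

// Reflected (LSB-first) CRC-32, polynomial 0xEDB88320. Same calling
// convention as crc32_arch_optimized(): "crc" is the running value and
// the pre/post inversion happens inside.
static uint32_t
crc32_reference(const uint8_t *buf, size_t size, uint32_t crc)
{
    crc = ~crc;

    for (size_t i = 0; i < size; ++i) {
        crc ^= buf[i];
        for (int bit = 0; bit < 8; ++bit)
            crc = (crc >> 1) ^ (0xEDB88320U & (0U - (crc & 1U)));
    }

    return ~crc;
}
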
331
332
333
// Even though this is an inline function, compile it only when needed.
334
// This way it won't appear in E2K builds at all.
335
#if defined(CRC32_GENERIC) || defined(CRC64_GENERIC)
336
// Inlining this function duplicates the function body in crc32_resolve() and
337
// crc64_resolve(), but this is acceptable because this is a tiny function.
338
static inline bool
339
is_arch_extension_supported(void)
340
12
{
341
12
  int success = 1;
342
12
  uint32_t r[4]; // eax, ebx, ecx, edx
343
344
#if defined(_MSC_VER)
345
  // This needs <intrin.h> with MSVC. ICC has it as a built-in
346
  // on all platforms.
347
  __cpuid(r, 1);
348
#elif defined(HAVE_CPUID_H)
349
  // Compared to just using __asm__ to run CPUID, this also checks
350
  // that CPUID is supported and saves and restores ebx as that is
351
  // needed with GCC < 5 with position-independent code (PIC).
352
12
  success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]);
353
#else
354
  // Just a fallback that shouldn't be needed.
355
  __asm__("cpuid\n\t"
356
      : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3])
357
      : "a"(1), "c"(0));
358
#endif
359
360
  // Returns true if these are supported:
361
  // CLMUL (bit 1 in ecx)
362
  // SSSE3 (bit 9 in ecx)
363
  // SSE4.1 (bit 19 in ecx)
364
12
  const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19);
365
12
  return success && (r[2] & ecx_mask) == ecx_mask;
366
367
  // Alternative methods that weren't used:
368
  //   - ICC's _may_i_use_cpu_feature: the other methods should work too.
369
  //   - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul")
370
  //
371
  // CPUID decoding is needed with MSVC anyway and older GCC. This keeps
372
  // the feature checks in the build system simpler too. The nice thing
373
  // about __builtin_cpu_supports would be that it generates very short
374
  // code as it only reads a variable set at startup, but a few bytes
375
  // doesn't matter here.
376
12
}
crc32_fast.c:is_arch_extension_supported
Line | Count | Source
340
6
{
341
6
  int success = 1;
342
6
  uint32_t r[4]; // eax, ebx, ecx, edx
343
344
#if defined(_MSC_VER)
345
  // This needs <intrin.h> with MSVC. ICC has it as a built-in
346
  // on all platforms.
347
  __cpuid(r, 1);
348
#elif defined(HAVE_CPUID_H)
349
  // Compared to just using __asm__ to run CPUID, this also checks
350
  // that CPUID is supported and saves and restores ebx as that is
351
  // needed with GCC < 5 with position-independent code (PIC).
352
6
  success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]);
353
#else
354
  // Just a fallback that shouldn't be needed.
355
  __asm__("cpuid\n\t"
356
      : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3])
357
      : "a"(1), "c"(0));
358
#endif
359
360
  // Returns true if these are supported:
361
  // CLMUL (bit 1 in ecx)
362
  // SSSE3 (bit 9 in ecx)
363
  // SSE4.1 (bit 19 in ecx)
364
6
  const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19);
365
6
  return success && (r[2] & ecx_mask) == ecx_mask;
366
367
  // Alternative methods that weren't used:
368
  //   - ICC's _may_i_use_cpu_feature: the other methods should work too.
369
  //   - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul")
370
  //
371
  // CPUID decoding is needed with MSVC anyway and older GCC. This keeps
372
  // the feature checks in the build system simpler too. The nice thing
373
  // about __builtin_cpu_supports would be that it generates very short
374
  // code as it only reads a variable set at startup, but a few bytes
375
  // doesn't matter here.
376
6
}
crc64_fast.c:is_arch_extension_supported
Line | Count | Source
340
6
{
341
6
  int success = 1;
342
6
  uint32_t r[4]; // eax, ebx, ecx, edx
343
344
#if defined(_MSC_VER)
345
  // This needs <intrin.h> with MSVC. ICC has it as a built-in
346
  // on all platforms.
347
  __cpuid(r, 1);
348
#elif defined(HAVE_CPUID_H)
349
  // Compared to just using __asm__ to run CPUID, this also checks
350
  // that CPUID is supported and saves and restores ebx as that is
351
  // needed with GCC < 5 with position-independent code (PIC).
352
6
  success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]);
353
#else
354
  // Just a fallback that shouldn't be needed.
355
  __asm__("cpuid\n\t"
356
      : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3])
357
      : "a"(1), "c"(0));
358
#endif
359
360
  // Returns true if these are supported:
361
  // CLMUL (bit 1 in ecx)
362
  // SSSE3 (bit 9 in ecx)
363
  // SSE4.1 (bit 19 in ecx)
364
6
  const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19);
365
6
  return success && (r[2] & ecx_mask) == ecx_mask;
366
367
  // Alternative methods that weren't used:
368
  //   - ICC's _may_i_use_cpu_feature: the other methods should work too.
369
  //   - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul")
370
  //
371
  // CPUID decoding is needed with MSVC anyway and older GCC. This keeps
372
  // the feature checks in the build system simpler too. The nice thing
373
  // about __builtin_cpu_supports would be that it generates very short
374
  // code as it only reads a variable set at startup, but a few bytes
375
  // doesn't matter here.
376
6
}
377
#endif
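
The comment above mentions __builtin_cpu_supports() as an alternative detection method (GCC >= 6, Clang, ICX). A hedged sketch of what that variant could look like; is_arch_extension_supported() above is what the crc32_resolve()/crc64_resolve() dispatchers actually use:

#include <stdbool.h>

// Hypothetical alternative, not used by liblzma: ask the compiler's
// cached CPU feature flags instead of decoding CPUID by hand. The
// three features match the ones required by crc_attr_target above.
static bool
has_crc_clmul_extensions(void)
{
    return __builtin_cpu_supports("ssse3")
            && __builtin_cpu_supports("sse4.1")
            && __builtin_cpu_supports("pclmul");
}
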